1use super::blockers::{
2 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3 xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15 EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17 InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18 SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21 fetch::{
22 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24 },
25 network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
lazy_static! {
    /// Substrings (framework names, generic bundle names and captcha/CDN URL
    /// prefixes) that feed [`ALLOWED_MATCHER`].
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        // Framework / bundle name fragments.
        "jquery", "angular",
        "react", "vue", "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app", "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        // Path fragments and absolute URL prefixes.
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/", "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/",
        "https://google.com/recaptcha/api.js",
        "https://www.gstatic.com/recaptcha/",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://api.leminnow.com/captcha/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://client-api.arkoselabs.com/",
        "https://www.capy.me/puzzle/",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
    ];

    /// Aho-Corasick automaton over [`JS_FRAMEWORK_ALLOW`]; a URL matches when it
    /// contains any of the patterns. Panics on first access if the automaton
    /// cannot be built.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// Third-party script URL fragments that feed [`ALLOWED_MATCHER_3RD_PARTY`].
    // NOTE(review): "https://captcha.px-cloud.net/" is listed twice and
    // "https://static.geetest.com/v4/gt4.js" is subsumed by
    // "https://static.geetest.com/"; harmless for `is_match`, but could be pruned.
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.capy.me/puzzle/",
        "https://www.gstatic.com/recaptcha/",
        "https://google.com/recaptcha/",
        "https://www.google.com/recaptcha/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://newassets.hcaptcha.com/",
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://api.leminnow.com/captcha/",
        "https://client-api.arkoselabs.com/",
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "discourse-cdn.com/",
        "hcaptcha.com",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];

    /// Aho-Corasick automaton over [`JS_FRAMEWORK_ALLOW_3RD_PARTY`]. The request
    /// pause handler uses it to un-block script requests that an earlier rule
    /// had marked as skipped. Panics on first access if the automaton cannot be
    /// built.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Set of framework asset path fragments.
    // NOTE(review): not referenced in the visible part of this file — confirm
    // the consumer before changing entries.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            "_astro/", "_app/immutable"
        }
    };

    /// Set of MIME types for documents, archives, images, video and audio.
    // NOTE(review): presumably content types the crawler does not want to
    // download; the consumer is outside the visible part of this file.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names that are skipped when `ignore_visuals` is enabled
    /// (compared against `ResourceType::as_ref()`).
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names that the request pause handler always marks as
    /// skipped (compared against `ResourceType::as_ref()`).
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Other",
        "Prefetch",
        "Ping",
    };

    /// Case-insensitive `css` extension used when checking XHR URLs for stylesheets.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// Initial command chain: just `Network.enable` with default parameters.
    /// Empty if the parameters fail to serialize.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// Initial command chain that enables the network domain and additionally
    /// tells the browser to ignore certificate errors. Commands that fail to
    /// serialize are left out.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// `Fetch.enable` parameters: handle auth challenges and pause every URL
    /// (`*`) at the request stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
            .handle_auth_requests(true)
            .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
            .build()
    };
}
229
/// Returns `true` when `status` is one of the HTTP redirect codes
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
234
/// Tracks in-flight requests for one target and decides, for every request
/// paused through the CDP `Fetch` domain, whether it is continued, fulfilled
/// or failed. Outgoing CDP commands and request lifecycle notifications are
/// queued as [`NetworkEvent`]s and drained with `poll`.
#[derive(Debug)]
pub struct NetworkManager {
    /// FIFO queue of pending events/commands returned by `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// When set, `init_commands` uses the chain that also ignores certificate errors.
    ignore_httpserrors: bool,
    /// Requests keyed by network request id; entries are removed when a
    /// response, loading-finished or loading-failed event arrives.
    requests: HashMap<RequestId, HttpRequest>,
    /// `requestWillBeSent` events parked until the matching `Fetch.requestPaused`
    /// event shows up.
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers last applied through `set_extra_headers`.
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception ids of paused requests whose `requestWillBeSent` event has
    /// not been seen yet.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the caller asked for the browser cache to be disabled.
    user_cache_disabled: bool,
    /// Requests for which credentials were already supplied once; a second
    /// auth challenge for the same id is cancelled.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials used to answer auth challenges, if any.
    credentials: Option<Credentials>,
    /// Whether the caller requested request interception.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed with `BlockedByClient`. Turned
    /// on automatically once the byte budget in `max_bytes_allowed` hits zero.
    block_all: bool,
    /// Whether interception is considered active at the protocol level; toggled
    /// by `enable_request_intercept`, `disable_request_intercept` and `authenticate`.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Last offline-emulation state sent to the browser.
    offline: bool,
    /// Timeout handed to the [`CommandChain`] built by `init_commands`.
    pub request_timeout: Duration,
    /// Skip `Image`, `Media` and `Font` resources (and related asset URLs).
    pub ignore_visuals: bool,
    /// Skip stylesheet resources.
    pub block_stylesheets: bool,
    /// Together with `only_html` or `ignore_visuals`, enables the embedded-script
    /// check for script and document requests.
    pub block_javascript: bool,
    /// Apply the analytics/tracker blocklists to scripts and XHR. Defaults to `true`.
    pub block_analytics: bool,
    /// See `block_javascript`.
    pub only_html: bool,
    /// Set when the first document requested ends with `.xml`; `.xsl` requests
    /// are then exempt from the visual/stylesheet skip rules.
    pub xml_document: bool,
    /// Site-specific interception rules consulted for script, data and document requests.
    pub intercept_manager: NetworkInterceptManager,
    /// Counts document requests that hit the current target URL again; at 3 or
    /// more, further requests are marked as skipped.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request (or the value from `set_page_url`).
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`.
    pub document_target_domain: String,
    /// Remaining response byte budget; `None` means no limit is tracked.
    pub max_bytes_allowed: Option<u64>,
    /// Key used to look up session-cache entries for allowed requests.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// Policy deciding whether a session-cache hit may be served.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Raw substring patterns backing `whitelist_matcher`.
    whitelist_patterns: Vec<String>,
    /// Compiled whitelist; `None` when there are no patterns or the build failed.
    whitelist_matcher: Option<AhoCorasick>,
    /// Raw substring patterns backing `blacklist_matcher`.
    blacklist_patterns: Vec<String>,
    /// Compiled blacklist; `None` when there are no patterns or the build failed.
    blacklist_matcher: Option<AhoCorasick>,
    /// When `true` a blacklist hit is applied last and cannot be overridden by
    /// the allow rules; when `false` it is applied early and may be overridden.
    blacklist_strict: bool,
}
357
358impl NetworkManager {
359 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
361 Self {
362 queued_events: Default::default(),
363 ignore_httpserrors,
364 requests: Default::default(),
365 requests_will_be_sent: Default::default(),
366 extra_headers: Default::default(),
367 request_id_to_interception_id: Default::default(),
368 user_cache_disabled: false,
369 attempted_authentications: Default::default(),
370 credentials: None,
371 block_all: false,
372 user_request_interception_enabled: false,
373 protocol_request_interception_enabled: false,
374 offline: false,
375 request_timeout,
376 ignore_visuals: false,
377 block_javascript: false,
378 block_stylesheets: false,
379 block_analytics: true,
380 only_html: false,
381 xml_document: false,
382 intercept_manager: NetworkInterceptManager::Unknown,
383 document_reload_tracker: 0,
384 document_target_url: String::new(),
385 document_target_domain: String::new(),
386 whitelist_patterns: Vec::new(),
387 whitelist_matcher: None,
388 blacklist_patterns: Vec::new(),
389 blacklist_matcher: None,
390 blacklist_strict: true,
391 max_bytes_allowed: None,
392 #[cfg(feature = "_cache")]
393 cache_site_key: None,
394 #[cfg(feature = "_cache")]
395 cache_policy: None,
396 }
397 }
398
399 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
401 where
402 I: IntoIterator<Item = S>,
403 S: Into<String>,
404 {
405 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
406 self.rebuild_whitelist_matcher();
407 }
408
409 pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
411 where
412 I: IntoIterator<Item = S>,
413 S: Into<String>,
414 {
415 self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
416 self.rebuild_blacklist_matcher();
417 }
418
419 pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
421 self.blacklist_patterns.push(pattern.into());
422 self.rebuild_blacklist_matcher();
423 }
424
425 pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
427 where
428 I: IntoIterator<Item = S>,
429 S: Into<String>,
430 {
431 self.blacklist_patterns
432 .extend(patterns.into_iter().map(Into::into));
433 self.rebuild_blacklist_matcher();
434 }
435
436 pub fn clear_blacklist(&mut self) {
438 self.blacklist_patterns.clear();
439 self.blacklist_matcher = None;
440 }
441
442 pub fn set_blacklist_strict(&mut self, strict: bool) {
444 self.blacklist_strict = strict;
445 }
446
447 #[inline]
448 fn rebuild_blacklist_matcher(&mut self) {
449 if self.blacklist_patterns.is_empty() {
450 self.blacklist_matcher = None;
451 return;
452 }
453
454 let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
455 self.blacklist_matcher = AhoCorasick::new(refs).ok();
456 }
457
458 #[inline]
459 fn is_blacklisted(&self, url: &str) -> bool {
460 self.blacklist_matcher
461 .as_ref()
462 .map(|m| m.is_match(url))
463 .unwrap_or(false)
464 }
465
466 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
468 self.whitelist_patterns.push(pattern.into());
469 self.rebuild_whitelist_matcher();
470 }
471
472 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
474 where
475 I: IntoIterator<Item = S>,
476 S: Into<String>,
477 {
478 self.whitelist_patterns
479 .extend(patterns.into_iter().map(Into::into));
480 self.rebuild_whitelist_matcher();
481 }
482
483 #[inline]
484 fn rebuild_whitelist_matcher(&mut self) {
485 if self.whitelist_patterns.is_empty() {
486 self.whitelist_matcher = None;
487 return;
488 }
489
490 let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
491
492 self.whitelist_matcher = AhoCorasick::new(refs).ok();
494 }
495
496 #[inline]
497 fn is_whitelisted(&self, url: &str) -> bool {
498 self.whitelist_matcher
499 .as_ref()
500 .map(|m| m.is_match(url))
501 .unwrap_or(false)
502 }
503
504 pub fn init_commands(&self) -> CommandChain {
506 let cmds = if self.ignore_httpserrors {
507 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
508 } else {
509 INIT_CHAIN.clone()
510 };
511 CommandChain::new(cmds, self.request_timeout)
512 }
513
514 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
516 let method = cmd.identifier();
517 if let Ok(params) = serde_json::to_value(cmd) {
518 self.queued_events
519 .push_back(NetworkEvent::SendCdpRequest((method, params)));
520 }
521 }
522
523 pub fn poll(&mut self) -> Option<NetworkEvent> {
525 self.queued_events.pop_front()
526 }
527
528 pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
530 &self.extra_headers
531 }
532
533 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
535 self.extra_headers = headers;
536 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
537 self.extra_headers.remove("Proxy-Authorization");
538 if !self.extra_headers.is_empty() {
539 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
540 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
541 }
542 }
543 }
544
545 pub fn set_service_worker_enabled(&mut self, bypass: bool) {
546 self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
547 }
548
549 pub fn set_block_all(&mut self, block_all: bool) {
550 self.block_all = block_all;
551 }
552
553 pub fn set_request_interception(&mut self, enabled: bool) {
554 self.user_request_interception_enabled = enabled;
555 self.update_protocol_request_interception();
556 }
557
558 pub fn set_cache_enabled(&mut self, enabled: bool) {
559 let run = self.user_cache_disabled != !enabled;
560 self.user_cache_disabled = !enabled;
561 if run {
562 self.update_protocol_cache_disabled();
563 }
564 }
565
566 pub fn enable_request_intercept(&mut self) {
568 self.protocol_request_interception_enabled = true;
569 }
570
571 pub fn disable_request_intercept(&mut self) {
573 self.protocol_request_interception_enabled = false;
574 }
575
576 #[cfg(feature = "_cache")]
578 pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
579 self.cache_site_key = cache_site_key;
580 }
581
582 #[cfg(feature = "_cache")]
584 pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
585 self.cache_policy = cache_policy;
586 }
587
588 pub fn update_protocol_cache_disabled(&mut self) {
589 self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
590 }
591
592 pub fn authenticate(&mut self, credentials: Credentials) {
593 self.credentials = Some(credentials);
594 self.update_protocol_request_interception();
595 self.protocol_request_interception_enabled = true;
596 }
597
598 fn update_protocol_request_interception(&mut self) {
599 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
600
601 if enabled == self.protocol_request_interception_enabled {
602 return;
603 }
604
605 if enabled {
606 self.push_cdp_request(ENABLE_FETCH.clone())
607 } else {
608 self.push_cdp_request(DisableParams::default())
609 }
610 }
611
612 #[inline]
615 fn should_block_script_blocklist_only(&self, url: &str) -> bool {
616 let block_analytics = self.block_analytics;
618
619 if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
621 {
622 return true;
623 }
624
625 if crate::handler::blockers::block_websites::block_website(url) {
627 return true;
628 }
629
630 if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
637 let p_slash = Self::strip_query_fragment(path_with_slash);
639 let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
640
641 let base = match p_slash.rsplit('/').next() {
643 Some(b) => b,
644 None => p_slash,
645 };
646
647 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
650 return true;
651 }
652 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
653 return true;
654 }
655 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
656 return true;
657 }
658
659 if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
662 return true;
663 }
664
665 if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
667 return true;
668 }
669 }
670
671 false
672 }
673
674 #[inline]
679 fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
680 let idx = url.find("//")?;
682 let after_slashes = idx + 2;
683
684 let slash_rel = url[after_slashes..].find('/')?;
686 let slash_idx = after_slashes + slash_rel;
687
688 if slash_idx < url.len() {
689 Some(&url[slash_idx..])
690 } else {
691 None
692 }
693 }
694
695 #[inline]
700 fn strip_query_fragment(s: &str) -> &str {
701 let q = s.find('?');
702 let h = s.find('#');
703
704 match (q, h) {
705 (None, None) => s,
706 (Some(i), None) => &s[..i],
707 (None, Some(i)) => &s[..i],
708 (Some(i), Some(j)) => &s[..i.min(j)],
709 }
710 }
711
712 #[inline]
714 fn skip_xhr(
715 &self,
716 skip_networking: bool,
717 event: &EventRequestPaused,
718 network_event: bool,
719 ) -> bool {
720 if !skip_networking && network_event {
722 let request_url = event.request.url.as_str();
723
724 let skip_analytics =
726 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
727
728 if skip_analytics {
729 true
730 } else if self.block_stylesheets || self.ignore_visuals {
731 let block_css = self.block_stylesheets;
732 let block_media = self.ignore_visuals;
733
734 let mut block_request = false;
735
736 if let Some(position) = request_url.rfind('.') {
737 let hlen = request_url.len();
738 let has_asset = hlen - position;
739
740 if has_asset >= 3 {
741 let next_position = position + 1;
742
743 if block_media
744 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
745 &request_url[next_position..].into(),
746 )
747 {
748 block_request = true;
749 } else if block_css {
750 block_request =
751 CaseInsensitiveString::from(request_url[next_position..].as_bytes())
752 .contains(&**CSS_EXTENSION)
753 }
754 }
755 }
756
757 if !block_request {
758 block_request = ignore_script_xhr_media(request_url);
759 }
760
761 block_request
762 } else {
763 skip_networking
764 }
765 } else {
766 skip_networking
767 }
768 }
769
770 #[cfg(feature = "adblock")]
771 #[inline]
772 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
774 if skip_networking {
775 true
776 } else {
777 block_ads(&event.request.url) || self.detect_ad(event)
778 }
779 }
780
781 #[cfg(not(feature = "adblock"))]
783 #[inline]
784 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
785 use crate::handler::blockers::block_websites::block_ads;
786 if skip_networking {
787 true
788 } else {
789 block_ads(&event.request.url)
790 }
791 }
792
793 #[inline]
794 fn fail_request_blocked(
796 &mut self,
797 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
798 ) {
799 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
800 request_id.clone(),
801 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
802 );
803 self.push_cdp_request(params);
804 }
805
806 #[inline]
807 fn fulfill_request_empty_200(
809 &mut self,
810 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
811 ) {
812 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
813 request_id.clone(),
814 200,
815 );
816 self.push_cdp_request(params);
817 }
818
819 #[cfg(feature = "_cache")]
820 #[inline]
821 fn fulfill_request_from_cache(
825 &mut self,
826 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
827 body: &[u8],
828 headers: &std::collections::HashMap<String, String>,
829 status: i64,
830 ) {
831 use crate::cdp::browser_protocol::fetch::HeaderEntry;
832 use crate::handler::network::fetch::FulfillRequestParams;
833 use base64::Engine;
834
835 let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
836
837 for (k, v) in headers.iter() {
838 resp_headers.push(HeaderEntry {
839 name: k.clone().into(),
840 value: v.clone().into(),
841 });
842 }
843
844 let mut params = FulfillRequestParams::new(request_id.clone(), status);
845
846 params.body = Some(
848 base64::engine::general_purpose::STANDARD
849 .encode(body)
850 .into(),
851 );
852
853 params.response_headers = Some(resp_headers);
854
855 self.push_cdp_request(params);
856 }
857
858 #[inline]
859 fn continue_request_with_url(
861 &mut self,
862 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
863 url: Option<&str>,
864 intercept_response: bool,
865 ) {
866 let mut params = ContinueRequestParams::new(request_id.clone());
867 if let Some(url) = url {
868 params.url = Some(url.to_string());
869 params.intercept_response = Some(intercept_response);
870 }
871 self.push_cdp_request(params);
872 }
873
874 #[inline]
876 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
877 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
878 return;
879 }
880
881 let resource_type = &event.resource_type;
882
883 if self.block_all {
884 tracing::debug!(
885 "Blocked (block_all): {:?} - {}",
886 event.resource_type,
887 event.request.url
888 );
889 return self.fail_request_blocked(&event.request_id);
890 }
891
892 if let Some(network_id) = event.network_id.as_ref() {
893 if let Some(request_will_be_sent) =
894 self.requests_will_be_sent.remove(network_id.as_ref())
895 {
896 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
897 } else {
898 self.request_id_to_interception_id
899 .insert(network_id.clone(), event.request_id.clone().into());
900 }
901 }
902
903 let javascript_resource = *resource_type == ResourceType::Script;
905 let document_resource = *resource_type == ResourceType::Document;
906 let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
907
908 let mut skip_networking =
910 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
911
912 if !skip_networking {
914 skip_networking = self.document_reload_tracker >= 3;
915 }
916
917 let (current_url_cow, had_replacer) =
919 self.handle_document_replacement_and_tracking(event, document_resource);
920
921 let current_url: &str = current_url_cow.as_ref();
922
923 let blacklisted = self.is_blacklisted(current_url);
924
925 if !self.blacklist_strict && blacklisted {
926 skip_networking = true;
927 }
928
929 if !skip_networking {
930 if self.xml_document && current_url.ends_with(".xsl") {
932 skip_networking = false;
933 } else {
934 skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
935 }
936 }
937
938 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
939
940 if !skip_networking
942 && self.block_javascript
943 && (self.only_html || self.ignore_visuals)
944 && (javascript_resource || document_resource)
945 {
946 skip_networking = ignore_script_embedded(current_url);
947 }
948
949 if !skip_networking && javascript_resource {
952 skip_networking = self.should_block_script_blocklist_only(current_url);
953 }
954
955 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
957
958 if !skip_networking && (javascript_resource || network_resource || document_resource) {
960 skip_networking = self.intercept_manager.intercept_detection(
961 current_url,
962 self.ignore_visuals,
963 network_resource,
964 );
965 }
966
967 if !skip_networking && (javascript_resource || network_resource) {
969 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
970 }
971
972 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
975 {
976 skip_networking = false;
977 }
978
979 if skip_networking && self.is_whitelisted(current_url) {
981 skip_networking = false;
982 }
983
984 if self.blacklist_strict && blacklisted {
985 skip_networking = true;
986 }
987
988 if skip_networking {
989 tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
990 self.fulfill_request_empty_200(&event.request_id);
991 } else {
992 #[cfg(feature = "_cache")]
993 {
994 if let (Some(policy), Some(cache_site_key)) =
995 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
996 {
997 let current_url = format!("{}:{}", event.request.method, ¤t_url);
998
999 if let Some((res, cache_policy)) =
1000 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
1001 {
1002 if policy.allows_cached(&cache_policy) {
1003 tracing::debug!(
1004 "Remote Cached: {:?} - {}",
1005 resource_type,
1006 ¤t_url
1007 );
1008 return self.fulfill_request_from_cache(
1009 &event.request_id,
1010 &res.body,
1011 &res.headers,
1012 res.status as i64,
1013 );
1014 }
1015 }
1016 }
1017 }
1018
1019 tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
1021 self.continue_request_with_url(
1022 &event.request_id,
1023 if had_replacer {
1024 Some(current_url)
1025 } else {
1026 None
1027 },
1028 !had_replacer,
1029 );
1030 }
1031 }
1032
1033 #[inline]
1039 fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1040 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1041 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1042 }
1043
1044 pub fn has_target_domain(&self) -> bool {
1046 !self.document_target_url.is_empty()
1047 }
1048
1049 pub fn set_page_url(&mut self, page_target_url: String) {
1051 let host_base = host_and_rest(&page_target_url)
1052 .map(|(h, _)| base_domain_from_host(h))
1053 .unwrap_or("");
1054
1055 self.document_target_domain = host_base.to_string();
1056 self.document_target_url = page_target_url;
1057 }
1058
1059 pub fn clear_target_domain(&mut self) {
1061 self.document_reload_tracker = 0;
1062 self.document_target_url = Default::default();
1063 self.document_target_domain = Default::default();
1064 }
1065
1066 #[inline]
1074 fn handle_document_replacement_and_tracking<'a>(
1075 &mut self,
1076 event: &'a EventRequestPaused,
1077 document_resource: bool,
1078 ) -> (Cow<'a, str>, bool) {
1079 let mut replacer: Option<String> = None;
1080 let current_url = event.request.url.as_str();
1081
1082 if document_resource {
1083 if self.document_target_url == current_url {
1084 self.document_reload_tracker += 1;
1085 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1086 {
1087 let (http_document_replacement, mut https_document_replacement) =
1088 if self.document_target_url.starts_with("http://") {
1089 (
1090 self.document_target_url.replacen("http://", "http//", 1),
1091 self.document_target_url.replacen("http://", "https://", 1),
1092 )
1093 } else {
1094 (
1095 self.document_target_url.replacen("https://", "https//", 1),
1096 self.document_target_url.replacen("https://", "http://", 1),
1097 )
1098 };
1099
1100 let trailing = https_document_replacement.ends_with('/');
1102 if trailing {
1103 https_document_replacement.pop();
1104 }
1105 if https_document_replacement.ends_with('/') {
1106 https_document_replacement.pop();
1107 }
1108
1109 let redirect_mask = format!(
1110 "{}{}",
1111 https_document_replacement, http_document_replacement
1112 );
1113
1114 if current_url == redirect_mask {
1115 replacer = Some(if trailing {
1116 format!("{}/", https_document_replacement)
1117 } else {
1118 https_document_replacement
1119 });
1120 }
1121 }
1122
1123 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1124 self.xml_document = true;
1125 }
1126
1127 self.document_target_url = event.request.url.clone();
1129 self.document_target_domain = host_and_rest(&self.document_target_url)
1130 .map(|(h, _)| base_domain_from_host(h).to_string())
1131 .unwrap_or_default();
1132 }
1133
1134 let current_url_cow = match replacer {
1135 Some(r) => Cow::Owned(r),
1136 None => Cow::Borrowed(event.request.url.as_str()),
1137 };
1138
1139 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1140 (current_url_cow, had_replacer)
1141 }
1142
1143 #[cfg(feature = "adblock")]
1145 pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1146 use adblock::{
1147 lists::{FilterSet, ParseOptions, RuleTypes},
1148 Engine,
1149 };
1150
1151 lazy_static::lazy_static! {
1152 static ref AD_ENGINE: Engine = {
1153 let mut filter_set = FilterSet::new(false);
1154 let mut rules = ParseOptions::default();
1155 rules.rule_types = RuleTypes::All;
1156
1157 filter_set.add_filters(
1158 &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1159 rules,
1160 );
1161
1162 Engine::from_filter_set(filter_set, true)
1163 };
1164 };
1165
1166 let blockable = ResourceType::Image == event.resource_type
1167 || event.resource_type == ResourceType::Media
1168 || event.resource_type == ResourceType::Stylesheet
1169 || event.resource_type == ResourceType::Document
1170 || event.resource_type == ResourceType::Fetch
1171 || event.resource_type == ResourceType::Xhr;
1172
1173 let u = &event.request.url;
1174
1175 let block_request = blockable
1176 && {
1178 let request = adblock::request::Request::preparsed(
1179 &u,
1180 "example.com",
1181 "example.com",
1182 &event.resource_type.as_ref().to_lowercase(),
1183 !event.request.is_same_site.unwrap_or_default());
1184
1185 AD_ENGINE.check_network_request(&request).matched
1186 };
1187
1188 block_request
1189 }
1190
1191 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1192 let response = if self
1193 .attempted_authentications
1194 .contains(event.request_id.as_ref())
1195 {
1196 AuthChallengeResponseResponse::CancelAuth
1197 } else if self.credentials.is_some() {
1198 self.attempted_authentications
1199 .insert(event.request_id.clone().into());
1200 AuthChallengeResponseResponse::ProvideCredentials
1201 } else {
1202 AuthChallengeResponseResponse::Default
1203 };
1204
1205 let mut auth = AuthChallengeResponse::new(response);
1206 if let Some(creds) = self.credentials.clone() {
1207 auth.username = Some(creds.username);
1208 auth.password = Some(creds.password);
1209 }
1210 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1211 }
1212
1213 pub fn set_offline_mode(&mut self, value: bool) {
1215 if self.offline == value {
1216 return;
1217 }
1218 self.offline = value;
1219 if let Ok(network) = EmulateNetworkConditionsParams::builder()
1220 .offline(self.offline)
1221 .latency(0)
1222 .download_throughput(-1.)
1223 .upload_throughput(-1.)
1224 .build()
1225 {
1226 self.push_cdp_request(network);
1227 }
1228 }
1229
1230 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1232 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1233 if let Some(interception_id) = self
1234 .request_id_to_interception_id
1235 .remove(event.request_id.as_ref())
1236 {
1237 self.on_request(event, Some(interception_id));
1238 } else {
1239 self.requests_will_be_sent
1241 .insert(event.request_id.clone(), event.clone());
1242 }
1243 } else {
1244 self.on_request(event, None);
1245 }
1246 }
1247
1248 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1250 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1251 request.from_memory_cache = true;
1252 }
1253 }
1254
1255 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1257 let mut request_failed = false;
1258
1259 let mut deducted: u64 = 0;
1261
1262 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1263 let before = *max_bytes;
1264
1265 let received_bytes: u64 = event.response.encoded_data_length as u64;
1267
1268 let content_length: Option<u64> = event
1270 .response
1271 .headers
1272 .inner()
1273 .get("content-length")
1274 .and_then(|v| v.as_str())
1275 .and_then(|s| s.trim().parse::<u64>().ok());
1276
1277 *max_bytes = max_bytes.saturating_sub(received_bytes);
1279
1280 if let Some(cl) = content_length {
1282 if cl > *max_bytes {
1283 *max_bytes = 0;
1284 }
1285 }
1286
1287 request_failed = *max_bytes == 0;
1288
1289 deducted = before.saturating_sub(*max_bytes);
1291 }
1292
1293 if deducted > 0 {
1295 self.queued_events
1296 .push_back(NetworkEvent::BytesConsumed(deducted));
1297 }
1298
1299 if request_failed && self.max_bytes_allowed.is_some() {
1301 self.set_block_all(true);
1302 }
1303
1304 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1305 request.set_response(event.response.clone());
1306 self.queued_events.push_back(if request_failed {
1307 NetworkEvent::RequestFailed(request)
1308 } else {
1309 NetworkEvent::RequestFinished(request)
1310 });
1311 }
1312 }
1313
1314 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1316 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1317 if let Some(interception_id) = request.interception_id.as_ref() {
1318 self.attempted_authentications
1319 .remove(interception_id.as_ref());
1320 }
1321 self.queued_events
1322 .push_back(NetworkEvent::RequestFinished(request));
1323 }
1324 }
1325
1326 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1328 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1329 request.failure_text = Some(event.error_text.clone());
1330 if let Some(interception_id) = request.interception_id.as_ref() {
1331 self.attempted_authentications
1332 .remove(interception_id.as_ref());
1333 }
1334 self.queued_events
1335 .push_back(NetworkEvent::RequestFailed(request));
1336 }
1337 }
1338
1339 fn on_request(
1341 &mut self,
1342 event: &EventRequestWillBeSent,
1343 interception_id: Option<InterceptionId>,
1344 ) {
1345 let mut redirect_chain = Vec::new();
1346 let mut redirect_location = None;
1347
1348 if let Some(redirect_resp) = &event.redirect_response {
1349 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1350 if is_redirect_status(redirect_resp.status) {
1351 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1352 if redirect_resp.url != location {
1353 let fixed_location = location.replace(&redirect_resp.url, "");
1354
1355 if !fixed_location.is_empty() {
1356 request.response.as_mut().map(|resp| {
1357 resp.headers.0["Location"] =
1358 serde_json::Value::String(fixed_location.clone());
1359 });
1360 }
1361
1362 redirect_location = Some(fixed_location);
1363 }
1364 }
1365 }
1366
1367 self.handle_request_redirect(
1368 &mut request,
1369 if let Some(redirect_location) = redirect_location {
1370 let mut redirect_resp = redirect_resp.clone();
1371
1372 if !redirect_location.is_empty() {
1373 redirect_resp.headers.0["Location"] =
1374 serde_json::Value::String(redirect_location);
1375 }
1376
1377 redirect_resp
1378 } else {
1379 redirect_resp.clone()
1380 },
1381 );
1382
1383 redirect_chain = std::mem::take(&mut request.redirect_chain);
1384 redirect_chain.push(request);
1385 }
1386 }
1387
1388 let request = HttpRequest::new(
1389 event.request_id.clone(),
1390 event.frame_id.clone(),
1391 interception_id,
1392 self.user_request_interception_enabled,
1393 redirect_chain,
1394 );
1395
1396 self.requests.insert(event.request_id.clone(), request);
1397 self.queued_events
1398 .push_back(NetworkEvent::Request(event.request_id.clone()));
1399 }
1400
1401 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1403 request.set_response(response);
1404 if let Some(interception_id) = request.interception_id.as_ref() {
1405 self.attempted_authentications
1406 .remove(interception_id.as_ref());
1407 }
1408 }
1409}
1410
/// Events placed on the network manager's `queued_events` queue.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP method id together with its JSON payload.
    // NOTE(review): presumably produced by `push_cdp_request` for commands
    // that still have to be sent to the browser — confirm against that helper.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A new request was registered under this id (queued by `on_request`).
    Request(RequestId),
    /// Carries a request id for a response.
    // NOTE(review): not emitted anywhere in the visible part of this file —
    // confirm whether this variant is still in use.
    Response(RequestId),
    /// The request failed: loading failed, or the byte budget was exhausted
    /// when its response was received.
    RequestFailed(HttpRequest),
    /// The request completed: its response was received within budget, or
    /// loading finished.
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the `max_bytes_allowed` budget by a
    /// received response.
    BytesConsumed(u64),
}
1426
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// Page URL used by the script blocklist tests.
    const FORUM_PAGE: &str = "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085";
    /// Page URL used by the black/white-list tests.
    const EXAMPLE_PAGE: &str = "https://example.com/";
    /// Cloudflare Insights beacon, used as the black/white-list probe URL.
    const BEACON_URL: &str = "https://static.cloudflareinsights.com/beacon.min.js";

    /// Builds a manager (first argument `false`, 30 second duration) whose
    /// page URL is `page_url`.
    fn manager_on(page_url: &str) -> NetworkManager {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(page_url.to_string());
        nm
    }

    /// Builds a manager on `EXAMPLE_PAGE` where `beacon.min.js` is both
    /// blacklisted and whitelisted, with the given strictness.
    fn manager_with_conflicting_lists(strict: bool) -> NetworkManager {
        let mut nm = manager_on(EXAMPLE_PAGE);
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);
        nm.set_blacklist_strict(strict);
        nm
    }

    /// Shared assertions about the third-party allow-list matcher.
    fn assert_third_party_allow_list() {
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    #[test]
    fn test_allowed_matcher_3rd_party() {
        assert_third_party_allow_list();
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !manager_on(FORUM_PAGE).should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            manager_on(FORUM_PAGE).should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    #[test]
    fn test_allowed_matcher_3rd_party_sanity() {
        assert_third_party_allow_list();
    }

    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = manager_on(EXAMPLE_PAGE);
        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);

        assert!(nm.is_blacklisted(BEACON_URL));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let nm = manager_with_conflicting_lists(true);

        // Both lists match; only the strict flag is checked here.
        assert!(nm.is_whitelisted(BEACON_URL));
        assert!(nm.is_blacklisted(BEACON_URL));

        assert!(nm.blacklist_strict);
    }

    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let nm = manager_with_conflicting_lists(false);

        assert!(nm.is_blacklisted(BEACON_URL));
        assert!(nm.is_whitelisted(BEACON_URL));
        assert!(!nm.blacklist_strict);
    }
}