1#[cfg(any(feature = "adblock", feature = "firewall"))]
2use super::blockers::block_websites::block_ads;
3use super::blockers::{
4 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
5 xhr::IGNORE_XHR_ASSETS,
6};
7use crate::auth::Credentials;
8#[cfg(feature = "_cache")]
9use crate::cache::BasicCachePolicy;
10use crate::cmd::CommandChain;
11use crate::handler::http::HttpRequest;
12use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
13use aho_corasick::AhoCorasick;
14use case_insensitive_string::CaseInsensitiveString;
15use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
16use chromiumoxide_cdp::cdp::browser_protocol::network::{
17 EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
18 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
19 InitiatorType, InterceptionId, NetworkConditions, RequestId, ResourceType, Response,
20 SetCacheDisabledParams, SetExtraHttpHeadersParams,
21};
22use chromiumoxide_cdp::cdp::browser_protocol::{
23 fetch::{
24 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
25 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
26 },
27 network::SetBypassServiceWorkerParams,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::{
30 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
31};
32use chromiumoxide_types::{Command, Method, MethodId};
33use hashbrown::{HashMap, HashSet};
34use lazy_static::lazy_static;
35use reqwest::header::PROXY_AUTHORIZATION;
36use spider_network_blocker::intercept_manager::NetworkInterceptManager;
37pub use spider_network_blocker::scripts::{
38 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
39};
40use std::borrow::Cow;
41use std::collections::VecDeque;
42use std::time::{Duration, Instant};
43
44lazy_static! {
45 static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
47 "jquery", "angular",
49 "react", "vue", "bootstrap",
52 "d3",
53 "lodash",
54 "ajax",
55 "application",
56 "app", "main",
58 "index",
59 "bundle",
60 "vendor",
61 "runtime",
62 "polyfill",
63 "scripts",
64 "es2015.",
65 "es2020.",
66 "webpack",
67 "captcha",
68 "client",
69 "/cdn-cgi/challenge-platform/",
70 "/wp-content/js/", "https://m.stripe.network/",
73 "https://challenges.cloudflare.com/",
74 "https://www.google.com/recaptcha/",
75 "https://google.com/recaptcha/api.js",
76 "https://www.gstatic.com/recaptcha/",
77 "https://captcha.px-cloud.net/",
78 "https://geo.captcha-delivery.com/",
79 "https://api.leminnow.com/captcha/",
80 "https://cdn.auth0.com/js/lock/",
81 "https://captcha.gtimg.com",
82 "https://client-api.arkoselabs.com/",
83 "https://www.capy.me/puzzle/",
84 "https://newassets.hcaptcha.com/",
85 "https://cdn.auth0.com/client",
86 "https://js.stripe.com/",
87 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
90 ];
91
92 pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
97
98 static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
100 "https://m.stripe.network/",
102 "https://challenges.cloudflare.com/",
103 "https://js.stripe.com/",
104 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
107 "https://ct.captcha-delivery.com/",
108 "https://geo.captcha-delivery.com/",
109 "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://cdn.auth0.com/client",
111 "https://captcha.px-cloud.net/",
112 "https://www.capy.me/puzzle/",
113 "https://www.gstatic.com/recaptcha/",
114 "https://google.com/recaptcha/",
115 "https://www.google.com/recaptcha/",
116 "https://www.recaptcha.net/recaptcha/",
117 "https://js.hcaptcha.com/1/api.js",
118 "https://hcaptcha.com/1/api.js",
119 "https://js.datadome.co/tags.js",
120 "https://api-js.datadome.co/",
121 "https://client.perimeterx.net/",
122 "https://captcha.px-cdn.net/",
123 "https://newassets.hcaptcha.com/",
124 "https://captcha.px-cloud.net/",
125 "https://s.perimeterx.net/",
126 "https://api.leminnow.com/captcha/",
127 "https://client-api.arkoselabs.com/",
128 "https://static.geetest.com/v4/gt4.js",
129 "https://static.geetest.com/",
130 "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
131 "https://cdn.perfdrive.com/aperture/",
132 "https://assets.queue-it.net/",
133 "discourse-cdn.com/",
134 "hcaptcha.com",
135 "/cdn-cgi/challenge-platform/",
136 "/_Incapsula_Resource"
137 ];
138
139 pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
141
142 pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
144 phf::phf_set! {
145 "_astro/", "_app/immutable"
147 }
148 };
149
150 pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
152 "application/pdf",
153 "application/zip",
154 "application/x-rar-compressed",
155 "application/x-tar",
156 "image/png",
157 "image/jpeg",
158 "image/gif",
159 "image/bmp",
160 "image/webp",
161 "image/svg+xml",
162 "video/mp4",
163 "video/x-msvideo",
164 "video/x-matroska",
165 "video/webm",
166 "audio/mpeg",
167 "audio/ogg",
168 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
169 "application/vnd.ms-excel",
170 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
171 "application/vnd.ms-powerpoint",
172 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
173 "application/x-7z-compressed",
174 "application/x-rpm",
175 "application/x-shockwave-flash",
176 "application/rtf",
177 };
178
179 pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
181 "Image",
182 "Media",
183 "Font"
184 };
185
186 pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
188 "CspViolationReport",
189 "Ping",
190 };
191
192 pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
194
195 pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
197 let enable = EnableParams::default();
198
199 if let Ok(c) = serde_json::to_value(&enable) {
200 vec![(enable.identifier(), c)]
201 } else {
202 vec![]
203 }
204 };
205
206 pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
208 let enable = EnableParams::default();
209 let mut v = vec![];
210 if let Ok(c) = serde_json::to_value(&enable) {
211 v.push((enable.identifier(), c));
212 }
213 let ignore = SetIgnoreCertificateErrorsParams::new(true);
214 if let Ok(ignored) = serde_json::to_value(&ignore) {
215 v.push((ignore.identifier(), ignored));
216 }
217
218 v
219 };
220
221 pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
223 fetch::EnableParams::builder()
224 .handle_auth_requests(true)
225 .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
226 .build()
227 };
228}
229
/// Returns `true` when `status` is one of the HTTP redirect codes
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
234
/// Age in seconds after which `evict_stale_entries` drops entries from
/// `requests_will_be_sent` and `request_id_to_interception_id`.
const STALE_BUFFER_SECS: u64 = 30;

/// Age in seconds (measured from `created_at`) after which
/// `evict_stale_entries` drops entries from `requests`.
const STALE_REQUEST_SECS: u64 = 120;
247
/// Newtype around a shared `adblock::Engine`; it has a hand-written `Debug`
/// impl so it can sit inside `#[derive(Debug)]` types.
#[cfg(feature = "adblock")]
pub struct AdblockEngine(std::sync::Arc<adblock::Engine>);
251
/// Opaque `Debug`: prints only the type name, never the engine contents.
#[cfg(feature = "adblock")]
impl std::fmt::Debug for AdblockEngine {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut printer = formatter.debug_struct("AdblockEngine");
        printer.finish()
    }
}
258
/// Lets an `AdblockEngine` be used wherever an `&adblock::Engine` is expected.
#[cfg(feature = "adblock")]
impl std::ops::Deref for AdblockEngine {
    type Target = adblock::Engine;

    fn deref(&self) -> &Self::Target {
        let AdblockEngine(shared) = self;
        shared
    }
}
266
/// Per-page network state: buffers CDP network events, queues outgoing CDP
/// commands, and holds the blocking configuration consulted by
/// `on_fetch_request_paused`.
#[derive(Debug)]
pub struct NetworkManager {
    /// Outgoing events, drained front-first by `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// When `true`, `init_commands` uses `INIT_CHAIN_IGNORE_HTTP_ERRORS`.
    ignore_httpserrors: bool,
    /// Tracked requests by id; evicted by `created_at` age in `evict_stale_entries`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Buffered `requestWillBeSent` events with a timestamp used for eviction;
    /// consumed when the matching paused event arrives.
    requests_will_be_sent: HashMap<RequestId, (EventRequestWillBeSent, Instant)>,
    /// Extra HTTP headers last passed to `set_extra_headers`.
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception ids recorded when a paused event arrives before its
    /// `requestWillBeSent`; the timestamp is used for eviction.
    request_id_to_interception_id: HashMap<RequestId, (InterceptionId, Instant)>,
    /// Inverse of the user's cache setting (see `set_cache_enabled`).
    user_cache_disabled: bool,
    /// Ids for which authentication was attempted; pruned in
    /// `evict_stale_entries` against live interception ids.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials supplied through `authenticate`.
    credentials: Option<Credentials>,
    /// Set by `set_request_interception`.
    pub(crate) user_request_interception_enabled: bool,
    /// When `true`, every paused request is failed as blocked-by-client.
    block_all: bool,
    /// Whether protocol-level interception is considered active; toggled by
    /// `enable_request_intercept` / `disable_request_intercept` / `authenticate`.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Initialised to `false`; not read in the visible part of this file.
    offline: bool,
    /// Timeout handed to the `CommandChain` built by `init_commands`.
    pub request_timeout: Duration,
    /// Skip resource types listed in `IGNORE_VISUAL_RESOURCE_MAP`.
    pub ignore_visuals: bool,
    /// Skip stylesheet resources.
    pub block_stylesheets: bool,
    /// Enables the embedded-script filter together with `only_html` / `ignore_visuals`.
    pub block_javascript: bool,
    /// Enables the analytics URL filters.
    pub block_analytics: bool,
    /// Prefetch flag consulted in `on_fetch_request_paused`.
    pub block_prefetch: bool,
    /// See `block_javascript`.
    pub only_html: bool,
    /// Set when the first document URL ends with `.xml`; exempts `.xsl` URLs
    /// from the visual/stylesheet skip.
    pub xml_document: bool,
    /// Site-specific interception rules.
    pub intercept_manager: NetworkInterceptManager,
    /// Counts document requests for the current target URL; requests are
    /// skipped once it reaches 3.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request (or the one set via `set_page_url`).
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`.
    pub document_target_domain: String,
    /// Initialised to `None`; consumers are outside this section.
    pub max_bytes_allowed: Option<u64>,
    /// Initialised to `None`; consumers are outside this section.
    pub max_redirects: Option<usize>,
    /// Key used to look up session cache items.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// Policy deciding whether a cached item may be served.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Raw whitelist substrings; source for `whitelist_matcher`.
    whitelist_patterns: Vec<String>,
    /// Compiled whitelist, `None` when there are no patterns or the build failed.
    whitelist_matcher: Option<AhoCorasick>,
    /// Raw blacklist substrings; source for `blacklist_matcher`.
    blacklist_patterns: Vec<String>,
    /// Compiled blacklist, `None` when there are no patterns or the build failed.
    blacklist_matcher: Option<AhoCorasick>,
    /// When `true`, a blacklist hit overrides every allow rule.
    blacklist_strict: bool,
    /// Optional custom engine installed with `set_adblock_engine`.
    #[cfg(feature = "adblock")]
    adblock_engine: Option<AdblockEngine>,
}
404
405impl NetworkManager {
406 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
408 Self {
409 queued_events: Default::default(),
410 ignore_httpserrors,
411 requests: Default::default(),
412 requests_will_be_sent: Default::default(),
413 extra_headers: Default::default(),
414 request_id_to_interception_id: Default::default(),
415 user_cache_disabled: false,
416 attempted_authentications: Default::default(),
417 credentials: None,
418 block_all: false,
419 user_request_interception_enabled: false,
420 protocol_request_interception_enabled: false,
421 offline: false,
422 request_timeout,
423 ignore_visuals: false,
424 block_javascript: false,
425 block_stylesheets: false,
426 block_prefetch: true,
427 block_analytics: true,
428 only_html: false,
429 xml_document: false,
430 intercept_manager: NetworkInterceptManager::Unknown,
431 document_reload_tracker: 0,
432 document_target_url: String::new(),
433 document_target_domain: String::new(),
434 whitelist_patterns: Vec::new(),
435 whitelist_matcher: None,
436 blacklist_patterns: Vec::new(),
437 blacklist_matcher: None,
438 blacklist_strict: true,
439 max_bytes_allowed: None,
440 max_redirects: None,
441 #[cfg(feature = "_cache")]
442 cache_site_key: None,
443 #[cfg(feature = "_cache")]
444 cache_policy: None,
445 #[cfg(feature = "adblock")]
446 adblock_engine: None,
447 }
448 }
449
/// Installs a custom adblock engine, replacing any previous one.
#[cfg(feature = "adblock")]
pub fn set_adblock_engine(&mut self, engine: std::sync::Arc<adblock::Engine>) {
    let wrapped = AdblockEngine(engine);
    self.adblock_engine = Some(wrapped);
}
455
456 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
458 where
459 I: IntoIterator<Item = S>,
460 S: Into<String>,
461 {
462 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
463 self.rebuild_whitelist_matcher();
464 }
465
466 pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
468 where
469 I: IntoIterator<Item = S>,
470 S: Into<String>,
471 {
472 self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
473 self.rebuild_blacklist_matcher();
474 }
475
476 pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
478 self.blacklist_patterns.push(pattern.into());
479 self.rebuild_blacklist_matcher();
480 }
481
482 pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
484 where
485 I: IntoIterator<Item = S>,
486 S: Into<String>,
487 {
488 self.blacklist_patterns
489 .extend(patterns.into_iter().map(Into::into));
490 self.rebuild_blacklist_matcher();
491 }
492
493 pub fn clear_blacklist(&mut self) {
495 self.blacklist_patterns.clear();
496 self.blacklist_matcher = None;
497 }
498
/// Sets whether a blacklist hit is final (`true`: it overrides every allow
/// rule in `on_fetch_request_paused`) or can still be undone by later allow
/// rules (`false`).
pub fn set_blacklist_strict(&mut self, strict: bool) {
    self.blacklist_strict = strict;
}
503
504 #[inline]
505 fn rebuild_blacklist_matcher(&mut self) {
506 if self.blacklist_patterns.is_empty() {
507 self.blacklist_matcher = None;
508 return;
509 }
510
511 self.blacklist_matcher =
512 AhoCorasick::new(self.blacklist_patterns.iter().map(|s| s.as_str())).ok();
513 }
514
515 #[inline]
516 fn is_blacklisted(&self, url: &str) -> bool {
517 self.blacklist_matcher
518 .as_ref()
519 .map(|m| m.is_match(url))
520 .unwrap_or(false)
521 }
522
523 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
525 self.whitelist_patterns.push(pattern.into());
526 self.rebuild_whitelist_matcher();
527 }
528
529 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
531 where
532 I: IntoIterator<Item = S>,
533 S: Into<String>,
534 {
535 self.whitelist_patterns
536 .extend(patterns.into_iter().map(Into::into));
537 self.rebuild_whitelist_matcher();
538 }
539
540 #[inline]
541 fn rebuild_whitelist_matcher(&mut self) {
542 if self.whitelist_patterns.is_empty() {
543 self.whitelist_matcher = None;
544 return;
545 }
546
547 self.whitelist_matcher =
549 AhoCorasick::new(self.whitelist_patterns.iter().map(|s| s.as_str())).ok();
550 }
551
552 #[inline]
553 fn is_whitelisted(&self, url: &str) -> bool {
554 self.whitelist_matcher
555 .as_ref()
556 .map(|m| m.is_match(url))
557 .unwrap_or(false)
558 }
559
560 pub fn init_commands(&self) -> CommandChain {
562 let cmds = if self.ignore_httpserrors {
563 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
564 } else {
565 INIT_CHAIN.clone()
566 };
567 CommandChain::new(cmds, self.request_timeout)
568 }
569
570 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
572 let method = cmd.identifier();
573 if let Ok(params) = serde_json::to_value(cmd) {
574 self.queued_events
575 .push_back(NetworkEvent::SendCdpRequest((method, params)));
576 }
577 }
578
/// Pops the oldest queued event, or returns `None` when the queue is empty.
pub fn poll(&mut self) -> Option<NetworkEvent> {
    self.queued_events.pop_front()
}
583
584 pub fn evict_stale_entries(&mut self, now: Instant) {
589 let cutoff = now - Duration::from_secs(STALE_BUFFER_SECS);
590
591 self.requests_will_be_sent.retain(|_, (_, ts)| *ts > cutoff);
592 self.request_id_to_interception_id
593 .retain(|_, (_, ts)| *ts > cutoff);
594
595 let request_cutoff = now - Duration::from_secs(STALE_REQUEST_SECS);
600 self.requests
601 .retain(|_, req| req.created_at > request_cutoff);
602
603 if !self.attempted_authentications.is_empty() {
608 let live: HashSet<&str> = self
609 .requests
610 .values()
611 .filter_map(|r| r.interception_id.as_ref().map(|id| id.as_ref()))
612 .collect();
613 self.attempted_authentications
614 .retain(|id| live.contains(id.as_ref()));
615 }
616 }
617
/// Returns the extra HTTP headers currently stored on this manager.
pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
    &self.extra_headers
}
622
623 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
625 self.extra_headers = headers;
626 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
627 self.extra_headers.remove("Proxy-Authorization");
628 if !self.extra_headers.is_empty() {
629 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
630 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
631 }
632 }
633 }
634
/// Queues a `SetBypassServiceWorker` command with the given `bypass` flag.
///
/// NOTE(review): the method name says "enabled" but the argument is passed
/// straight through as the *bypass* flag — confirm callers pass the intended
/// polarity.
pub fn set_service_worker_enabled(&mut self, bypass: bool) {
    self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
}
638
/// Sets the `block_all` flag; while it is `true`, `on_fetch_request_paused`
/// fails every paused request as blocked-by-client.
pub fn set_block_all(&mut self, block_all: bool) {
    self.block_all = block_all;
}
642
/// Records the user's interception preference, then queues a Fetch
/// enable/disable command if the desired state differs from
/// `protocol_request_interception_enabled`.
pub fn set_request_interception(&mut self, enabled: bool) {
    self.user_request_interception_enabled = enabled;
    self.update_protocol_request_interception();
}
647
648 pub fn set_cache_enabled(&mut self, enabled: bool) {
649 let run = self.user_cache_disabled == enabled;
650 self.user_cache_disabled = !enabled;
651 if run {
652 self.update_protocol_cache_disabled();
653 }
654 }
655
/// Marks protocol-level request interception as enabled. Only the local
/// flag is set; no CDP command is queued here.
pub fn enable_request_intercept(&mut self) {
    self.protocol_request_interception_enabled = true;
}
660
/// Marks protocol-level request interception as disabled. Only the local
/// flag is cleared; no CDP command is queued here.
pub fn disable_request_intercept(&mut self) {
    self.protocol_request_interception_enabled = false;
}
665
/// Sets (or clears, with `None`) the key used for session cache lookups in
/// `on_fetch_request_paused`.
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    self.cache_site_key = cache_site_key;
}
671
/// Sets (or clears, with `None`) the policy that decides whether a cached
/// item may be served in `on_fetch_request_paused`.
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    self.cache_policy = cache_policy;
}
677
678 pub fn update_protocol_cache_disabled(&mut self) {
679 self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
680 }
681
/// Stores `credentials` and makes sure request interception is active so
/// auth challenges can be answered.
pub fn authenticate(&mut self, credentials: Credentials) {
    self.credentials = Some(credentials);
    // Must run before the flag below is set: the helper only queues the
    // Fetch enable command while the flag still reads "disabled".
    self.update_protocol_request_interception();
    self.protocol_request_interception_enabled = true;
}
687
688 fn update_protocol_request_interception(&mut self) {
689 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
690
691 if enabled == self.protocol_request_interception_enabled {
692 return;
693 }
694
695 if enabled {
696 self.push_cdp_request(ENABLE_FETCH.clone())
697 } else {
698 self.push_cdp_request(DisableParams::default())
699 }
700 }
701
702 #[inline]
705 fn should_block_script_blocklist_only(&self, url: &str) -> bool {
706 let block_analytics = self.block_analytics;
708
709 if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
711 {
712 return true;
713 }
714
715 if crate::handler::blockers::block_websites::block_website(url) {
717 return true;
718 }
719
720 if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
727 let p_slash = Self::strip_query_fragment(path_with_slash);
729 let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
730
731 let base = match p_slash.rsplit('/').next() {
733 Some(b) => b,
734 None => p_slash,
735 };
736
737 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
740 return true;
741 }
742 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
743 return true;
744 }
745 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
746 return true;
747 }
748
749 if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
752 return true;
753 }
754
755 if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
757 return true;
758 }
759 }
760
761 false
762 }
763
764 #[inline]
769 fn url_path_with_leading_slash(url: &str) -> Option<&str> {
770 let bytes = url.as_bytes();
772 let idx = memchr::memmem::find(bytes, b"//")?;
773 let after_slashes = idx + 2;
774
775 let slash_rel = memchr::memchr(b'/', &bytes[after_slashes..])?;
777 let slash_idx = after_slashes + slash_rel;
778
779 if slash_idx < url.len() {
780 Some(&url[slash_idx..])
781 } else {
782 None
783 }
784 }
785
786 #[inline]
791 fn strip_query_fragment(s: &str) -> &str {
792 match memchr::memchr2(b'?', b'#', s.as_bytes()) {
793 Some(i) => &s[..i],
794 None => s,
795 }
796 }
797
798 #[inline]
800 fn skip_xhr(
801 &self,
802 skip_networking: bool,
803 event: &EventRequestPaused,
804 network_event: bool,
805 ) -> bool {
806 if !skip_networking && network_event {
808 let request_url = event.request.url.as_str();
809
810 let skip_analytics =
812 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
813
814 if skip_analytics {
815 true
816 } else if self.block_stylesheets || self.ignore_visuals {
817 let block_css = self.block_stylesheets;
818 let block_media = self.ignore_visuals;
819
820 let mut block_request = false;
821
822 if let Some(position) = memchr::memrchr(b'.', request_url.as_bytes()) {
823 let hlen = request_url.len();
824 let has_asset = hlen - position;
825
826 if has_asset >= 3 {
827 let next_position = position + 1;
828
829 if block_media
830 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
831 &request_url[next_position..].into(),
832 )
833 {
834 block_request = true;
835 } else if block_css {
836 block_request = CaseInsensitiveString::from(
837 &request_url.as_bytes()[next_position..],
838 )
839 .contains(&**CSS_EXTENSION)
840 }
841 }
842 }
843
844 if !block_request {
845 block_request = ignore_script_xhr_media(request_url);
846 }
847
848 block_request
849 } else {
850 skip_networking
851 }
852 } else {
853 skip_networking
854 }
855 }
856
/// Ad check used when the `adblock` feature is on.
///
/// Keeps an existing skip decision; otherwise skips when the URL is on the
/// static ad list or the adblock engine flags the request.
#[cfg(feature = "adblock")]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    skip_networking || block_ads(&event.request.url) || self.detect_ad(event)
}
867
868 #[cfg(not(feature = "adblock"))]
870 #[inline]
871 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
872 use crate::handler::blockers::block_websites::block_ads;
873 if skip_networking {
874 true
875 } else {
876 block_ads(&event.request.url)
877 }
878 }
879
880 #[inline]
881 fn fail_request_blocked(
883 &mut self,
884 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
885 ) {
886 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
887 request_id.clone(),
888 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
889 );
890 self.push_cdp_request(params);
891 }
892
893 #[inline]
894 fn fulfill_request_empty_200(
896 &mut self,
897 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
898 ) {
899 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
900 request_id.clone(),
901 200,
902 );
903 self.push_cdp_request(params);
904 }
905
/// Queues a `Fetch.fulfillRequest` that answers `request_id` from cached data.
///
/// * `body` — raw response bytes; sent base64-encoded (standard alphabet).
/// * `headers` — response headers, one `HeaderEntry` per map entry.
/// * `status` — HTTP status code to report.
#[cfg(feature = "_cache")]
#[inline]
fn fulfill_request_from_cache(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    body: &[u8],
    headers: &std::collections::HashMap<String, String>,
    status: i64,
) {
    use crate::cdp::browser_protocol::fetch::HeaderEntry;
    use crate::handler::network::fetch::FulfillRequestParams;
    use base64::Engine;

    let response_headers: Vec<HeaderEntry> = headers
        .iter()
        .map(|(name, value)| HeaderEntry {
            name: name.clone(),
            value: value.clone(),
        })
        .collect();

    let encoded_body = base64::engine::general_purpose::STANDARD.encode(body);

    let mut params = FulfillRequestParams::new(request_id.clone(), status);
    params.body = Some(encoded_body.into());
    params.response_headers = Some(response_headers);

    self.push_cdp_request(params);
}
944
945 #[inline]
946 fn continue_request_with_url(
948 &mut self,
949 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
950 url: Option<&str>,
951 intercept_response: bool,
952 ) {
953 let mut params = ContinueRequestParams::new(request_id.clone());
954 if let Some(url) = url {
955 params.url = Some(url.to_string());
956 params.intercept_response = Some(intercept_response);
957 }
958 self.push_cdp_request(params);
959 }
960
961 #[inline]
963 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
964 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
965 return;
966 }
967
968 if self.block_all {
969 tracing::debug!(
970 "Blocked (block_all): {:?} - {}",
971 event.resource_type,
972 event.request.url
973 );
974 return self.fail_request_blocked(&event.request_id);
975 }
976
977 let initiator_type: Option<InitiatorType> = event
989 .network_id
990 .as_ref()
991 .and_then(|nid| self.requests_will_be_sent.get(nid.as_ref()))
992 .map(|(rwbs, _)| rwbs.initiator.r#type.clone());
993
994 if let Some(network_id) = event.network_id.as_ref() {
995 if let Some((request_will_be_sent, _)) =
996 self.requests_will_be_sent.remove(network_id.as_ref())
997 {
998 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
999 } else {
1000 self.request_id_to_interception_id.insert(
1001 network_id.clone(),
1002 (event.request_id.clone().into(), Instant::now()),
1003 );
1004 }
1005 }
1006
1007 let javascript_resource = event.resource_type == ResourceType::Script;
1009 let document_resource = event.resource_type == ResourceType::Document;
1010 let network_resource =
1011 !document_resource && crate::utils::is_data_resource(&event.resource_type);
1012
1013 let mut skip_networking =
1015 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
1016
1017 if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
1018 skip_networking = true;
1019 }
1020
1021 if !skip_networking {
1023 skip_networking = self.document_reload_tracker >= 3;
1024 }
1025
1026 let (current_url_cow, had_replacer) =
1028 self.handle_document_replacement_and_tracking(event, document_resource);
1029
1030 let current_url: &str = current_url_cow.as_ref();
1031
1032 let blacklisted = self.is_blacklisted(current_url);
1033
1034 if !self.blacklist_strict && blacklisted {
1035 skip_networking = true;
1036 }
1037
1038 if !skip_networking {
1039 if self.xml_document && current_url.ends_with(".xsl") {
1041 skip_networking = false;
1042 } else {
1043 skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
1044 }
1045 }
1046
1047 let is_main_document_request = document_resource
1070 && (event.redirected_request_id.is_some()
1071 || had_replacer
1072 || self.document_target_url.is_empty()
1073 || event.request.url == self.document_target_url);
1074 if !is_main_document_request {
1075 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
1076 }
1077
1078 if !skip_networking
1080 && self.block_javascript
1081 && (self.only_html || self.ignore_visuals)
1082 && (javascript_resource
1083 || document_resource
1084 || event.resource_type == ResourceType::Stylesheet
1085 || event.resource_type == ResourceType::Image)
1086 {
1087 skip_networking = ignore_script_embedded(current_url);
1088 }
1089
1090 if !skip_networking && javascript_resource {
1093 skip_networking = self.should_block_script_blocklist_only(current_url);
1094 }
1095
1096 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
1098
1099 if !skip_networking && (javascript_resource || network_resource || document_resource) {
1101 skip_networking = self.intercept_manager.intercept_detection(
1102 current_url,
1103 self.ignore_visuals,
1104 network_resource,
1105 );
1106 }
1107
1108 if !skip_networking && (javascript_resource || network_resource) {
1110 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
1111 }
1112
1113 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
1116 {
1117 skip_networking = false;
1118 }
1119
1120 if skip_networking && self.is_whitelisted(current_url) {
1122 skip_networking = false;
1123 }
1124
1125 if skip_networking
1152 && self.block_stylesheets
1153 && event.resource_type == ResourceType::Stylesheet
1154 && !matches!(initiator_type, Some(InitiatorType::Script))
1155 {
1156 skip_networking = false;
1157 }
1158
1159 if self.blacklist_strict && blacklisted {
1160 skip_networking = true;
1161 }
1162
1163 if skip_networking {
1164 tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
1165 self.fulfill_request_empty_200(&event.request_id);
1166 } else {
1167 #[cfg(feature = "_cache")]
1168 {
1169 if let (Some(policy), Some(cache_site_key)) =
1170 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
1171 {
1172 let current_url = format!("{}:{}", event.request.method, ¤t_url);
1173
1174 if let Some((res, cache_policy)) =
1175 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
1176 {
1177 if policy.allows_cached(&cache_policy) {
1178 tracing::debug!(
1179 "Remote Cached: {:?} - {}",
1180 &event.resource_type,
1181 ¤t_url
1182 );
1183 let flat_headers = crate::http::headers_from_multi(&res.headers);
1184 return self.fulfill_request_from_cache(
1185 &event.request_id,
1186 &res.body,
1187 &flat_headers,
1188 res.status as i64,
1189 );
1190 }
1191 }
1192 }
1193 }
1194
1195 tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
1197 self.continue_request_with_url(
1198 &event.request_id,
1199 if had_replacer {
1200 Some(current_url)
1201 } else {
1202 None
1203 },
1204 !had_replacer,
1205 );
1206 }
1207 }
1208
1209 #[inline]
1215 fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1216 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1217 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1218 }
1219
/// Returns `true` when a document target is set. The check looks at
/// `document_target_url`, not at `document_target_domain`.
pub fn has_target_domain(&self) -> bool {
    !self.document_target_url.is_empty()
}
1224
1225 pub fn set_page_url(&mut self, page_target_url: String) {
1227 let host_base = host_and_rest(&page_target_url)
1228 .map(|(h, _)| base_domain_from_host(h))
1229 .unwrap_or("");
1230
1231 self.document_target_domain = host_base.to_string();
1232 self.document_target_url = page_target_url;
1233 }
1234
1235 pub fn clear_target_domain(&mut self) {
1237 self.document_reload_tracker = 0;
1238 self.document_target_url = Default::default();
1239 self.document_target_domain = Default::default();
1240 }
1241
1242 #[inline]
1250 fn handle_document_replacement_and_tracking<'a>(
1251 &mut self,
1252 event: &'a EventRequestPaused,
1253 document_resource: bool,
1254 ) -> (Cow<'a, str>, bool) {
1255 let mut replacer: Option<String> = None;
1256 let current_url = event.request.url.as_str();
1257
1258 if document_resource {
1259 if self.document_target_url == current_url {
1260 self.document_reload_tracker += 1;
1261 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1262 {
1263 let (http_document_replacement, mut https_document_replacement) =
1264 if self.document_target_url.starts_with("http://") {
1265 (
1266 self.document_target_url.replacen("http://", "http//", 1),
1267 self.document_target_url.replacen("http://", "https://", 1),
1268 )
1269 } else {
1270 (
1271 self.document_target_url.replacen("https://", "https//", 1),
1272 self.document_target_url.replacen("https://", "http://", 1),
1273 )
1274 };
1275
1276 let trailing = https_document_replacement.ends_with('/');
1278 if trailing {
1279 https_document_replacement.pop();
1280 }
1281 if https_document_replacement.ends_with('/') {
1282 https_document_replacement.pop();
1283 }
1284
1285 let redirect_mask = format!(
1286 "{}{}",
1287 https_document_replacement, http_document_replacement
1288 );
1289
1290 if current_url == redirect_mask {
1291 replacer = Some(if trailing {
1292 format!("{}/", https_document_replacement)
1293 } else {
1294 https_document_replacement
1295 });
1296 }
1297 }
1298
1299 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1300 self.xml_document = true;
1301 }
1302
1303 self.document_target_url = event.request.url.clone();
1305 self.document_target_domain = host_and_rest(&self.document_target_url)
1306 .map(|(h, _)| base_domain_from_host(h).to_string())
1307 .unwrap_or_default();
1308 }
1309
1310 let current_url_cow = match replacer {
1311 Some(r) => Cow::Owned(r),
1312 None => Cow::Borrowed(event.request.url.as_str()),
1313 };
1314
1315 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1316 (current_url_cow, had_replacer)
1317 }
1318
1319 #[cfg(feature = "adblock")]
1323 pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1324 use adblock::{
1325 lists::{FilterSet, ParseOptions, RuleTypes},
1326 Engine,
1327 };
1328
1329 lazy_static::lazy_static! {
1330 static ref AD_ENGINE: Engine = {
1331 let mut filter_set = FilterSet::new(false);
1332 let mut rules = ParseOptions::default();
1333 rules.rule_types = RuleTypes::All;
1334
1335 filter_set.add_filters(
1336 &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1337 rules,
1338 );
1339
1340 #[cfg(feature = "adblock_easylist")]
1343 {
1344 static EASYLIST: &str = include_str!(concat!(env!("OUT_DIR"), "/easylist.txt"));
1345 static EASYPRIVACY: &str = include_str!(concat!(env!("OUT_DIR"), "/easyprivacy.txt"));
1346
1347 if !EASYLIST.is_empty() {
1348 filter_set.add_filter_list(EASYLIST, rules);
1349 }
1350 if !EASYPRIVACY.is_empty() {
1351 filter_set.add_filter_list(EASYPRIVACY, rules);
1352 }
1353 }
1354
1355 Engine::from_filter_set(filter_set, true)
1356 };
1357 }
1358
1359 let blockable = event.resource_type == ResourceType::Script
1360 || event.resource_type == ResourceType::Image
1361 || event.resource_type == ResourceType::Media
1362 || event.resource_type == ResourceType::Stylesheet
1363 || event.resource_type == ResourceType::Document
1364 || event.resource_type == ResourceType::Fetch
1365 || event.resource_type == ResourceType::Xhr;
1366
1367 if !blockable {
1368 return false;
1369 }
1370
1371 let u = &event.request.url;
1372
1373 let source_domain = if self.document_target_domain.is_empty() {
1374 "example.com"
1375 } else {
1376 &self.document_target_domain
1377 };
1378
1379 let hostname = u
1382 .strip_prefix("https://")
1383 .or_else(|| u.strip_prefix("http://"))
1384 .and_then(|rest| rest.split('/').next())
1385 .map(
1387 |authority| match memchr::memrchr(b'@', authority.as_bytes()) {
1388 Some(i) => &authority[i + 1..],
1389 None => authority,
1390 },
1391 )
1392 .and_then(|host_port| host_port.split(':').next())
1394 .unwrap_or(source_domain);
1395
1396 let resource_type_str = match event.resource_type {
1397 ResourceType::Script => "script",
1398 ResourceType::Image => "image",
1399 ResourceType::Media => "media",
1400 ResourceType::Stylesheet => "stylesheet",
1401 ResourceType::Document => "document",
1402 ResourceType::Fetch => "fetch",
1403 ResourceType::Xhr => "xhr",
1404 _ => "other",
1405 };
1406
1407 let request = adblock::request::Request::preparsed(
1408 u,
1409 hostname,
1410 source_domain,
1411 resource_type_str,
1412 !event.request.is_same_site.unwrap_or_default(),
1413 );
1414
1415 let engine: &Engine = match self.adblock_engine.as_ref() {
1416 Some(custom) => custom,
1417 None => &AD_ENGINE,
1418 };
1419
1420 engine.check_network_request(&request).matched
1421 }
1422
1423 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1424 let response = if self
1425 .attempted_authentications
1426 .contains(event.request_id.as_ref())
1427 {
1428 AuthChallengeResponseResponse::CancelAuth
1429 } else if self.credentials.is_some() {
1430 self.attempted_authentications
1431 .insert(event.request_id.clone().into());
1432 AuthChallengeResponseResponse::ProvideCredentials
1433 } else {
1434 AuthChallengeResponseResponse::Default
1435 };
1436
1437 let mut auth = AuthChallengeResponse::new(response);
1438 if let Some(creds) = self.credentials.clone() {
1439 auth.username = Some(creds.username);
1440 auth.password = Some(creds.password);
1441 }
1442 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1443 }
1444
1445 pub fn set_offline_mode(&mut self, value: bool) {
1447 if self.offline == value {
1448 return;
1449 }
1450 self.offline = value;
1451 if let Ok(condition) = NetworkConditions::builder()
1452 .url_pattern("")
1453 .latency(0)
1454 .download_throughput(-1.)
1455 .upload_throughput(-1.)
1456 .build()
1457 {
1458 if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1459 .offline(self.offline)
1460 .matched_network_condition(condition)
1461 .build()
1462 {
1463 self.push_cdp_request(network);
1464 }
1465 }
1466 }
1467
1468 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1470 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1471 if let Some((interception_id, _)) = self
1472 .request_id_to_interception_id
1473 .remove(event.request_id.as_ref())
1474 {
1475 self.on_request(event, Some(interception_id));
1476 } else {
1477 self.requests_will_be_sent
1478 .insert(event.request_id.clone(), (event.clone(), Instant::now()));
1479 }
1480 } else {
1481 self.on_request(event, None);
1482 }
1483 }
1484
1485 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1487 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1488 request.from_memory_cache = true;
1489 }
1490 }
1491
1492 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1494 let mut request_failed = false;
1495
1496 let mut deducted: u64 = 0;
1498
1499 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1500 let before = *max_bytes;
1501
1502 let received_bytes: u64 = event.response.encoded_data_length as u64;
1504
1505 let content_length: Option<u64> = event
1507 .response
1508 .headers
1509 .inner()
1510 .get("content-length")
1511 .and_then(|v| v.as_str())
1512 .and_then(|s| s.trim().parse::<u64>().ok());
1513
1514 *max_bytes = max_bytes.saturating_sub(received_bytes);
1516
1517 if let Some(cl) = content_length {
1519 if cl > *max_bytes {
1520 *max_bytes = 0;
1521 }
1522 }
1523
1524 request_failed = *max_bytes == 0;
1525
1526 deducted = before.saturating_sub(*max_bytes);
1528 }
1529
1530 if deducted > 0 {
1532 self.queued_events
1533 .push_back(NetworkEvent::BytesConsumed(deducted));
1534 }
1535
1536 if request_failed && self.max_bytes_allowed.is_some() {
1538 self.set_block_all(true);
1539 }
1540
1541 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1542 request.set_response(event.response.clone());
1543 self.queued_events.push_back(if request_failed {
1544 NetworkEvent::RequestFailed(request)
1545 } else {
1546 NetworkEvent::RequestFinished(request)
1547 });
1548 }
1549 }
1550
1551 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1553 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1554 if let Some(interception_id) = request.interception_id.as_ref() {
1555 self.attempted_authentications
1556 .remove(interception_id.as_ref());
1557 }
1558 self.queued_events
1559 .push_back(NetworkEvent::RequestFinished(request));
1560 }
1561 }
1562
1563 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1565 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1566 request.failure_text = Some(event.error_text.clone());
1567 if let Some(interception_id) = request.interception_id.as_ref() {
1568 self.attempted_authentications
1569 .remove(interception_id.as_ref());
1570 }
1571 self.queued_events
1572 .push_back(NetworkEvent::RequestFailed(request));
1573 }
1574 }
1575
1576 fn on_request(
1578 &mut self,
1579 event: &EventRequestWillBeSent,
1580 interception_id: Option<InterceptionId>,
1581 ) {
1582 let mut redirect_chain = Vec::new();
1583 let mut redirect_location = None;
1584
1585 if let Some(redirect_resp) = &event.redirect_response {
1586 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1587 if is_redirect_status(redirect_resp.status) {
1588 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1589 if redirect_resp.url != location {
1590 let fixed_location = location.replace(&redirect_resp.url, "");
1591
1592 if !fixed_location.is_empty() {
1593 if let Some(resp) = request.response.as_mut() {
1594 resp.headers.0["Location"] =
1595 serde_json::Value::String(fixed_location.clone());
1596 }
1597 }
1598
1599 redirect_location = Some(fixed_location);
1600 }
1601 }
1602 }
1603
1604 {
1605 let mut redirect_resp = redirect_resp.clone();
1606
1607 if let Some(redirect_location) = redirect_location {
1608 if !redirect_location.is_empty() {
1609 redirect_resp.headers.0["Location"] =
1610 serde_json::Value::String(redirect_location);
1611 }
1612 }
1613
1614 self.handle_request_redirect(&mut request, redirect_resp);
1615 }
1616
1617 redirect_chain = std::mem::take(&mut request.redirect_chain);
1618 redirect_chain.push(request);
1619 }
1620 }
1621
1622 if let Some(cap) = self.max_redirects {
1625 let is_document = matches!(event.r#type, Some(ResourceType::Document));
1626 if is_document && redirect_chain.len() > cap {
1627 let mut failed = HttpRequest::new(
1628 event.request_id.clone(),
1629 event.frame_id.clone(),
1630 interception_id,
1631 self.user_request_interception_enabled,
1632 redirect_chain,
1633 );
1634 failed.url = Some(event.request.url.clone());
1635 failed.method = Some(event.request.method.clone());
1636 failed.failure_text = Some("net::ERR_TOO_MANY_REDIRECTS".into());
1637 self.push_cdp_request(
1638 chromiumoxide_cdp::cdp::browser_protocol::page::StopLoadingParams::default(),
1639 );
1640 self.queued_events
1641 .push_back(NetworkEvent::RequestFailed(failed));
1642 return;
1643 }
1644 }
1645
1646 let request = HttpRequest::new(
1647 event.request_id.clone(),
1648 event.frame_id.clone(),
1649 interception_id,
1650 self.user_request_interception_enabled,
1651 redirect_chain,
1652 );
1653
1654 let rid = event.request_id.clone();
1655 self.queued_events
1656 .push_back(NetworkEvent::Request(rid.clone()));
1657 self.requests.insert(rid, request);
1658 }
1659
1660 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1662 request.set_response(response);
1663 if let Some(interception_id) = request.interception_id.as_ref() {
1664 self.attempted_authentications
1665 .remove(interception_id.as_ref());
1666 }
1667 }
1668}
1669
/// Events queued by the network manager for its consumer to drain.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP command to send: the method identifier (for example
    /// `Fetch.failRequest` or `Page.stopLoading`) paired with a JSON value —
    /// presumably the serialized command parameters; confirm against `push_cdp_request`.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A new request with this id has started being tracked.
    Request(RequestId),
    /// NOTE(review): not emitted anywhere in the visible part of this file;
    /// presumably signals a response for the given request id — confirm with producers.
    Response(RequestId),
    /// The request failed: loading failed, the byte budget ran out, or the
    /// document redirect cap was exceeded.
    RequestFailed(HttpRequest),
    /// The request completed (response received or loading finished).
    RequestFinished(HttpRequest),
    /// Number of bytes a response removed from the configured byte budget.
    BytesConsumed(u64),
}
1685
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    #[cfg(feature = "adblock")]
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
    use std::time::Duration;

    /// Creates a manager with the constructor arguments shared by every test here.
    fn new_manager() -> NetworkManager {
        NetworkManager::new(false, Duration::from_secs(30))
    }

    /// Assertions about the third-party allow-list shared by the two allow-list tests.
    fn assert_third_party_allow_list() {
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        let always_allowed = [
            "https://js.stripe.com/v3/",
            "https://www.google.com/recaptcha/api.js?render=explicit",
            "https://code.jquery.com/jquery-3.7.1.min.js",
        ];
        for url in always_allowed {
            assert!(ALLOWED_MATCHER_3RD_PARTY.is_match(url));
        }
    }

    #[test]
    fn test_allowed_matcher_3rd_party() {
        assert_third_party_allow_list();
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = new_manager();
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = new_manager();
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    #[test]
    fn test_allowed_matcher_3rd_party_sanity() {
        assert_third_party_allow_list();
    }

    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.com/".to_string());
        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);

        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.com/".to_string());

        // The same pattern sits on both lists; strict mode is switched on.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);
        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        assert!(nm.blacklist_strict);
    }

    /// Builds a minimal `Fetch.requestPaused` event for `url`.
    #[cfg(feature = "adblock")]
    fn make_request_paused(
        url: &str,
        resource_type: ResourceType,
        is_same_site: bool,
    ) -> chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused {
        use chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused;
        use chromiumoxide_cdp::cdp::browser_protocol::network::{
            Headers, Request, RequestId, RequestReferrerPolicy, ResourcePriority,
        };
        use chromiumoxide_cdp::cdp::browser_protocol::page::FrameId;

        let request = Request {
            url: url.to_string(),
            method: "GET".to_string(),
            headers: Headers::new(serde_json::Value::Object(Default::default())),
            initial_priority: ResourcePriority::Medium,
            referrer_policy: RequestReferrerPolicy::NoReferrer,
            url_fragment: None,
            has_post_data: None,
            post_data_entries: None,
            mixed_content_type: None,
            is_link_preload: None,
            trust_token_params: None,
            is_same_site: Some(is_same_site),
            is_ad_related: None,
        };

        EventRequestPaused {
            request_id: RequestId::from("test-req".to_string()).into(),
            request,
            frame_id: FrameId::from("frame1".to_string()),
            resource_type,
            response_error_reason: None,
            response_status_code: None,
            response_status_text: None,
            response_headers: None,
            network_id: None,
            redirected_request_id: None,
        }
    }

    /// Installs a custom adblock engine on `nm` that contains exactly one filter rule.
    #[cfg(feature = "adblock")]
    fn install_single_rule_engine(nm: &mut NetworkManager, rule: &str) {
        let mut opts = adblock::lists::ParseOptions::default();
        opts.rule_types = adblock::lists::RuleTypes::All;

        let mut filter_set = adblock::lists::FilterSet::new(false);
        filter_set.add_filters([rule], opts);

        let engine = adblock::Engine::from_filter_set(filter_set, true);
        nm.set_adblock_engine(std::sync::Arc::new(engine));
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_detect_ad_blocks_known_tracker_scripts() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.wine-searcher.com/".to_string());

        let event = make_request_paused(
            "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
            ResourceType::Script,
            false,
        );

        assert!(
            nm.detect_ad(&event),
            "googletagmanager.com script should be detected as ad"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_detect_ad_allows_legitimate_scripts() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());

        let event = make_request_paused(
            "https://www.mylegitsite-test.com/static/js/app-bundle.js",
            ResourceType::Script,
            true,
        );

        assert!(
            !nm.detect_ad(&event),
            "legitimate first-party app bundle should not be blocked"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_detect_ad_uses_source_domain() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.wine-searcher.com/some-page".to_string());

        assert!(
            !nm.document_target_domain.is_empty(),
            "document_target_domain should be set after set_page_url"
        );

        let event = make_request_paused(
            "https://www.google-analytics.com/analytics.js",
            ResourceType::Script,
            false,
        );

        assert!(
            nm.detect_ad(&event),
            "google-analytics.com should be blocked as tracker"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_custom_adblock_engine_takes_precedence() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.com/".to_string());
        install_single_rule_engine(&mut nm, "||custom-tracker.example.net^");

        let event = make_request_paused(
            "https://custom-tracker.example.net/pixel.js",
            ResourceType::Script,
            false,
        );

        assert!(
            nm.detect_ad(&event),
            "custom engine rule should block custom-tracker.example.net"
        );
    }

    /// Feeds one paused request through `on_fetch_request_paused` and reports whether
    /// a `Fetch.fulfillRequest` or `Fetch.failRequest` command was queued in response.
    /// Events queued before the call are discarded first; the queue is left empty.
    #[cfg(feature = "adblock")]
    fn run_full_interception(
        nm: &mut NetworkManager,
        url: &str,
        resource_type: ResourceType,
        is_same_site: bool,
    ) -> bool {
        use super::NetworkEvent;

        while nm.poll().is_some() {}

        let event = make_request_paused(url, resource_type, is_same_site);
        nm.on_fetch_request_paused(&event);

        // `fold` (unlike `any`) keeps polling after a hit, so the queue is fully drained.
        std::iter::from_fn(|| nm.poll()).fold(false, |blocked, ev| {
            let is_block_command = match &ev {
                NetworkEvent::SendCdpRequest((method, _)) => {
                    let m: &str = method.as_ref();
                    m == "Fetch.fulfillRequest" || m == "Fetch.failRequest"
                }
                _ => false,
            };
            blocked || is_block_command
        })
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_tracker_script_blocked() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.wine-searcher.com/".to_string());

        let blocked = run_full_interception(
            &mut nm,
            "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
            ResourceType::Script,
            false,
        );
        assert!(blocked, "GTM script should be blocked through full pipeline");
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_legitimate_script_allowed() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());

        let blocked = run_full_interception(
            &mut nm,
            "https://www.mylegitsite-test.com/static/js/app-bundle.js",
            ResourceType::Script,
            true,
        );
        assert!(
            !blocked,
            "legitimate first-party script should be allowed through full pipeline"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_analytics_xhr_blocked() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.org/".to_string());

        let blocked = run_full_interception(
            &mut nm,
            "https://www.google-analytics.com/g/collect?v=2&tid=UA-123",
            ResourceType::Xhr,
            false,
        );
        assert!(
            blocked,
            "Google Analytics XHR should be blocked through full pipeline"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_whitelisted_overrides_adblock() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.org/".to_string());
        nm.set_whitelist_patterns(["googletagmanager.com"]);

        let blocked = run_full_interception(
            &mut nm,
            "https://www.googletagmanager.com/gtm.js?id=GTM-TEST",
            ResourceType::Script,
            false,
        );
        assert!(
            !blocked,
            "whitelisted tracker should be allowed even when adblock would block it"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_blacklist_strict_overrides_whitelist() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.org/".to_string());
        nm.set_blacklist_patterns(["cdn.example.net/evil.js"]);
        nm.set_whitelist_patterns(["cdn.example.net/evil.js"]);
        nm.set_blacklist_strict(true);

        let blocked = run_full_interception(
            &mut nm,
            "https://cdn.example.net/evil.js",
            ResourceType::Script,
            false,
        );
        assert!(blocked, "strict blacklist should win over whitelist");
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_first_party_document_not_blocked() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.nytimes.com/".to_string());

        let blocked = run_full_interception(
            &mut nm,
            "https://www.nytimes.com/2024/article.html",
            ResourceType::Document,
            true,
        );
        assert!(
            !blocked,
            "first-party document navigation should never be blocked"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_custom_engine_blocks_through_pipeline() {
        let mut nm = new_manager();
        nm.set_page_url("https://mysite.com/".to_string());
        install_single_rule_engine(&mut nm, "||evil-cdn.example.net^$script");

        let tracker_blocked = run_full_interception(
            &mut nm,
            "https://evil-cdn.example.net/tracker.js",
            ResourceType::Script,
            false,
        );
        assert!(
            tracker_blocked,
            "custom engine rule should block through full pipeline"
        );

        let first_party_blocked = run_full_interception(
            &mut nm,
            "https://mysite.com/app.js",
            ResourceType::Script,
            true,
        );
        assert!(
            !first_party_blocked,
            "first-party script should still be allowed with custom engine"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_ad_image_blocked() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());

        let pixel_blocked = run_full_interception(
            &mut nm,
            "https://googleads.g.doubleclick.net/pagead/viewthroughconversion/123/?random=456",
            ResourceType::Image,
            false,
        );
        assert!(
            pixel_blocked,
            "doubleclick ad image/tracking pixel should be blocked"
        );

        let logo_blocked = run_full_interception(
            &mut nm,
            "https://www.mylegitsite-test.com/images/logo.png",
            ResourceType::Image,
            true,
        );
        assert!(
            !logo_blocked,
            "legitimate first-party image should not be blocked"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_hostname_with_userinfo() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.org/".to_string());

        let blocked = run_full_interception(
            &mut nm,
            "https://user:pass@www.googletagmanager.com/gtm.js?id=GTM-XXXX",
            ResourceType::Script,
            false,
        );
        assert!(blocked, "tracker URL with userinfo should still be blocked");
    }

    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.com/".to_string());

        // The same pattern sits on both lists; strict mode is switched off.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);
        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }

    /// Deserializes a `Network.requestWillBeSent` event from JSON.
    ///
    /// When `redirect_from_url` is given, the event also carries a 302 redirect
    /// response from that URL whose `Location` header points at `url`.
    fn make_request_will_be_sent(
        request_id: &str,
        url: &str,
        resource_type: &str,
        redirect_from_url: Option<&str>,
    ) -> chromiumoxide_cdp::cdp::browser_protocol::network::EventRequestWillBeSent {
        let redirect_response = redirect_from_url.map(|from| {
            serde_json::json!({
                "url": from,
                "status": 302,
                "statusText": "Found",
                "headers": { "Location": url },
                "mimeType": "text/html",
                "charset": "",
                "connectionReused": false,
                "connectionId": 0.0,
                "encodedDataLength": 0.0,
                "securityState": "unknown"
            })
        });

        let mut payload = serde_json::json!({
            "requestId": request_id,
            "loaderId": "test-loader",
            "documentURL": url,
            "request": {
                "url": url,
                "method": "GET",
                "headers": {},
                "initialPriority": "Medium",
                "referrerPolicy": "no-referrer"
            },
            "timestamp": 0.0,
            "wallTime": 0.0,
            "initiator": { "type": "other" },
            "redirectHasExtraInfo": false,
            "type": resource_type,
            "frameId": "frame1"
        });
        if let Some(response) = redirect_response {
            payload["redirectResponse"] = response;
        }

        serde_json::from_value(payload).expect("EventRequestWillBeSent should deserialize")
    }

    /// Sends an initial request for `url_for(0)` followed by `hops` redirects, each
    /// reusing `request_id` and redirecting from `url_for(hop - 1)` to `url_for(hop)`.
    fn send_redirect_chain(
        nm: &mut NetworkManager,
        request_id: &str,
        resource_type: &str,
        hops: usize,
        url_for: impl Fn(usize) -> String,
    ) {
        nm.on_request_will_be_sent(&make_request_will_be_sent(
            request_id,
            &url_for(0),
            resource_type,
            None,
        ));
        for hop in 1..=hops {
            nm.on_request_will_be_sent(&make_request_will_be_sent(
                request_id,
                &url_for(hop),
                resource_type,
                Some(&url_for(hop - 1)),
            ));
        }
    }

    /// Polls events until the first request failed with `net::ERR_TOO_MANY_REDIRECTS`
    /// and returns it; returns `None` once the queue is empty.
    fn drain_too_many_redirects(nm: &mut NetworkManager) -> Option<super::HttpRequest> {
        std::iter::from_fn(|| nm.poll()).find_map(|ev| match ev {
            super::NetworkEvent::RequestFailed(req)
                if req.failure_text.as_deref() == Some("net::ERR_TOO_MANY_REDIRECTS") =>
            {
                Some(req)
            }
            _ => None,
        })
    }

    /// Polls events until a `Page.stopLoading` command is seen; `false` if the queue
    /// empties first.
    fn drain_stop_loading(nm: &mut NetworkManager) -> bool {
        std::iter::from_fn(|| nm.poll()).any(|ev| match ev {
            super::NetworkEvent::SendCdpRequest((method, _)) => {
                let m: &str = method.as_ref();
                m == "Page.stopLoading"
            }
            _ => false,
        })
    }

    #[test]
    fn test_max_redirects_none_allows_unlimited_chain() {
        let mut nm = new_manager();
        send_redirect_chain(&mut nm, "r1", "Document", 9, |hop| {
            format!("https://example.com/{hop}")
        });

        assert!(
            drain_too_many_redirects(&mut nm).is_none(),
            "no cap set: chain of 10 hops must not emit ERR_TOO_MANY_REDIRECTS"
        );
    }

    #[test]
    fn test_max_redirects_caps_document_chain() {
        // Each check runs on its own freshly driven manager because the drain helpers
        // consume queued events while searching.
        let capped_chain = |request_id: &str| {
            let mut nm = new_manager();
            nm.max_redirects = Some(3);
            send_redirect_chain(&mut nm, request_id, "Document", 4, |hop| {
                format!("https://example.com/{hop}")
            });
            nm
        };

        let mut nm = capped_chain("r1");
        let failed = drain_too_many_redirects(&mut nm)
            .expect("cap of 3 on a 4-hop chain must emit ERR_TOO_MANY_REDIRECTS");
        assert_eq!(
            failed.redirect_chain.len(),
            4,
            "failed request should preserve the full accumulated chain"
        );
        assert_eq!(
            failed.url.as_deref(),
            Some("https://example.com/4"),
            "failed request url should be the hop that tripped the cap"
        );

        let mut nm = capped_chain("r2");
        assert!(
            drain_stop_loading(&mut nm),
            "cap hit must dispatch Page.stopLoading to abort navigation"
        );
    }

    #[test]
    fn test_max_redirects_ignores_subresources() {
        let mut nm = new_manager();
        nm.max_redirects = Some(2);

        send_redirect_chain(&mut nm, "s1", "Script", 5, |hop| {
            format!("https://cdn.example.com/{hop}.js")
        });

        assert!(
            drain_too_many_redirects(&mut nm).is_none(),
            "sub-resource redirect chains must never be capped"
        );
    }
}