1#[cfg(any(feature = "adblock", feature = "firewall"))]
2use super::blockers::block_websites::block_ads;
3use super::blockers::{
4 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
5 xhr::IGNORE_XHR_ASSETS,
6};
7use crate::auth::Credentials;
8#[cfg(feature = "_cache")]
9use crate::cache::BasicCachePolicy;
10use crate::cmd::CommandChain;
11use crate::handler::http::HttpRequest;
12use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
13use aho_corasick::AhoCorasick;
14use case_insensitive_string::CaseInsensitiveString;
15use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
16use chromiumoxide_cdp::cdp::browser_protocol::network::{
17 EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
18 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
19 InterceptionId, NetworkConditions, RequestId, ResourceType, Response, SetCacheDisabledParams,
20 SetExtraHttpHeadersParams,
21};
22use chromiumoxide_cdp::cdp::browser_protocol::{
23 fetch::{
24 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
25 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
26 },
27 network::SetBypassServiceWorkerParams,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::{
30 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
31};
32use chromiumoxide_types::{Command, Method, MethodId};
33use hashbrown::{HashMap, HashSet};
34use lazy_static::lazy_static;
35use reqwest::header::PROXY_AUTHORIZATION;
36use spider_network_blocker::intercept_manager::NetworkInterceptManager;
37pub use spider_network_blocker::scripts::{
38 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
39};
40use std::borrow::Cow;
41use std::collections::VecDeque;
42use std::time::{Duration, Instant};
43
44lazy_static! {
45 static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
47 "jquery", "angular",
49 "react", "vue", "bootstrap",
52 "d3",
53 "lodash",
54 "ajax",
55 "application",
56 "app", "main",
58 "index",
59 "bundle",
60 "vendor",
61 "runtime",
62 "polyfill",
63 "scripts",
64 "es2015.",
65 "es2020.",
66 "webpack",
67 "captcha",
68 "client",
69 "/cdn-cgi/challenge-platform/",
70 "/wp-content/js/", "https://m.stripe.network/",
73 "https://challenges.cloudflare.com/",
74 "https://www.google.com/recaptcha/",
75 "https://google.com/recaptcha/api.js",
76 "https://www.gstatic.com/recaptcha/",
77 "https://captcha.px-cloud.net/",
78 "https://geo.captcha-delivery.com/",
79 "https://api.leminnow.com/captcha/",
80 "https://cdn.auth0.com/js/lock/",
81 "https://captcha.gtimg.com",
82 "https://client-api.arkoselabs.com/",
83 "https://www.capy.me/puzzle/",
84 "https://newassets.hcaptcha.com/",
85 "https://cdn.auth0.com/client",
86 "https://js.stripe.com/",
87 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
90 ];
91
92 pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
97
98 static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
100 "https://m.stripe.network/",
102 "https://challenges.cloudflare.com/",
103 "https://js.stripe.com/",
104 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
107 "https://ct.captcha-delivery.com/",
108 "https://geo.captcha-delivery.com/",
109 "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://cdn.auth0.com/client",
111 "https://captcha.px-cloud.net/",
112 "https://www.capy.me/puzzle/",
113 "https://www.gstatic.com/recaptcha/",
114 "https://google.com/recaptcha/",
115 "https://www.google.com/recaptcha/",
116 "https://www.recaptcha.net/recaptcha/",
117 "https://js.hcaptcha.com/1/api.js",
118 "https://hcaptcha.com/1/api.js",
119 "https://js.datadome.co/tags.js",
120 "https://api-js.datadome.co/",
121 "https://client.perimeterx.net/",
122 "https://captcha.px-cdn.net/",
123 "https://newassets.hcaptcha.com/",
124 "https://captcha.px-cloud.net/",
125 "https://s.perimeterx.net/",
126 "https://api.leminnow.com/captcha/",
127 "https://client-api.arkoselabs.com/",
128 "https://static.geetest.com/v4/gt4.js",
129 "https://static.geetest.com/",
130 "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
131 "https://cdn.perfdrive.com/aperture/",
132 "https://assets.queue-it.net/",
133 "discourse-cdn.com/",
134 "hcaptcha.com",
135 "/cdn-cgi/challenge-platform/",
136 "/_Incapsula_Resource"
137 ];
138
139 pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
141
142 pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
144 phf::phf_set! {
145 "_astro/", "_app/immutable"
147 }
148 };
149
150 pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
152 "application/pdf",
153 "application/zip",
154 "application/x-rar-compressed",
155 "application/x-tar",
156 "image/png",
157 "image/jpeg",
158 "image/gif",
159 "image/bmp",
160 "image/webp",
161 "image/svg+xml",
162 "video/mp4",
163 "video/x-msvideo",
164 "video/x-matroska",
165 "video/webm",
166 "audio/mpeg",
167 "audio/ogg",
168 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
169 "application/vnd.ms-excel",
170 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
171 "application/vnd.ms-powerpoint",
172 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
173 "application/x-7z-compressed",
174 "application/x-rpm",
175 "application/x-shockwave-flash",
176 "application/rtf",
177 };
178
179 pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
181 "Image",
182 "Media",
183 "Font"
184 };
185
186 pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
188 "CspViolationReport",
189 "Ping",
190 };
191
192 pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
194
195 pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
197 let enable = EnableParams::default();
198
199 if let Ok(c) = serde_json::to_value(&enable) {
200 vec![(enable.identifier(), c)]
201 } else {
202 vec![]
203 }
204 };
205
206 pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
208 let enable = EnableParams::default();
209 let mut v = vec![];
210 if let Ok(c) = serde_json::to_value(&enable) {
211 v.push((enable.identifier(), c));
212 }
213 let ignore = SetIgnoreCertificateErrorsParams::new(true);
214 if let Ok(ignored) = serde_json::to_value(&ignore) {
215 v.push((ignore.identifier(), ignored));
216 }
217
218 v
219 };
220
221 pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
223 fetch::EnableParams::builder()
224 .handle_auth_requests(true)
225 .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
226 .build()
227 };
228}
229
/// Returns `true` when `status` is one of the redirect codes 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];

    REDIRECT_CODES.contains(&status)
}
234
/// Age in seconds after which `evict_stale_entries` drops buffered `requests_will_be_sent`
/// and `request_id_to_interception_id` entries.
const STALE_BUFFER_SECS: u64 = 30;

/// Age in seconds (two minutes) after which `evict_stale_entries` drops tracked requests.
const STALE_REQUEST_SECS: u64 = 2 * 60;
247
/// Wrapper around a shared [`adblock::Engine`] so it can sit inside a `Debug` struct.
#[cfg(feature = "adblock")]
pub struct AdblockEngine(std::sync::Arc<adblock::Engine>);

#[cfg(feature = "adblock")]
impl std::fmt::Debug for AdblockEngine {
    /// Prints only the type name; the wrapped engine is not shown.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut repr = formatter.debug_struct("AdblockEngine");
        repr.finish()
    }
}

#[cfg(feature = "adblock")]
impl std::ops::Deref for AdblockEngine {
    type Target = adblock::Engine;

    /// Borrows the wrapped engine.
    fn deref(&self) -> &Self::Target {
        self.0.as_ref()
    }
}
266
/// Tracks network requests and decides, for every request paused by the Fetch domain,
/// whether it is blocked, answered from cache or allowed to continue.
#[derive(Debug)]
pub struct NetworkManager {
    /// Events produced by this manager; `poll` drains them front-first.
    queued_events: VecDeque<NetworkEvent>,
    /// When set, `init_commands` uses the chain that also ignores certificate errors.
    ignore_httpserrors: bool,
    /// Tracked requests by id; `evict_stale_entries` drops those older than
    /// `STALE_REQUEST_SECS`.
    requests: HashMap<RequestId, HttpRequest>,
    /// `EventRequestWillBeSent` events paired with an `Instant` (presumably when they were
    /// buffered); `on_fetch_request_paused` removes the entry once the matching pause arrives.
    requests_will_be_sent: HashMap<RequestId, (EventRequestWillBeSent, Instant)>,
    /// Headers last given to `set_extra_headers`, without proxy-authorization entries.
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception id and insertion time for paused requests that had no buffered
    /// `EventRequestWillBeSent` when the pause arrived.
    request_id_to_interception_id: HashMap<RequestId, (InterceptionId, Instant)>,
    /// Whether the cache is disabled; sent by `update_protocol_cache_disabled`.
    user_cache_disabled: bool,
    /// Requests that were already offered credentials; a second auth challenge for the same
    /// request is cancelled.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials offered to auth challenges; set by `authenticate`.
    credentials: Option<Credentials>,
    /// User-level interception switch, set by `set_request_interception`.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed with `BlockedByClient`.
    block_all: bool,
    /// Whether protocol-level interception is considered enabled.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Initialised to `false`. NOTE(review): not read in the visible part of this file.
    offline: bool,
    /// Timeout handed to the `CommandChain` built by `init_commands`.
    pub request_timeout: Duration,
    /// Skip resource types listed in `IGNORE_VISUAL_RESOURCE_MAP` and enable the media checks
    /// in `skip_xhr`.
    pub ignore_visuals: bool,
    /// Skip `Stylesheet` resources and `css` data requests.
    pub block_stylesheets: bool,
    /// Together with `only_html` or `ignore_visuals`, enables the `ignore_script_embedded`
    /// check.
    pub block_javascript: bool,
    /// Enables the analytics block lists for scripts and data requests.
    pub block_analytics: bool,
    /// Consulted for `Prefetch` resources in `on_fetch_request_paused`.
    pub block_prefetch: bool,
    /// See `block_javascript`.
    pub only_html: bool,
    /// Set when the first document URL ends in `.xml`; `.xsl` requests then bypass the
    /// visual/stylesheet skip.
    pub xml_document: bool,
    /// Site-specific interception rules consulted in `on_fetch_request_paused`.
    pub intercept_manager: NetworkInterceptManager,
    /// Counts document requests whose URL equals `document_target_url`; at 3 or more, further
    /// requests are skipped.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request (or the value given to `set_page_url`).
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`.
    pub document_target_domain: String,
    /// Initialised to `None`. NOTE(review): not read in the visible part of this file.
    pub max_bytes_allowed: Option<u64>,
    /// Site key used to look up session-cache entries.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// Policy deciding whether a session-cache entry may be served.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Substrings that un-block a request that would otherwise be skipped.
    whitelist_patterns: Vec<String>,
    /// Matcher compiled from `whitelist_patterns`; `None` when the list is empty or the build
    /// failed.
    whitelist_matcher: Option<AhoCorasick>,
    /// Substrings that mark a URL as blacklisted.
    blacklist_patterns: Vec<String>,
    /// Matcher compiled from `blacklist_patterns`; `None` when the list is empty or the build
    /// failed.
    blacklist_matcher: Option<AhoCorasick>,
    /// When true a blacklist hit is applied last and always blocks; when false it is applied
    /// early and later allow-list/whitelist checks may undo it.
    blacklist_strict: bool,
    /// Custom ad-block engine; when absent `detect_ad` uses its built-in engine.
    #[cfg(feature = "adblock")]
    adblock_engine: Option<AdblockEngine>,
}
396
397impl NetworkManager {
398 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
400 Self {
401 queued_events: Default::default(),
402 ignore_httpserrors,
403 requests: Default::default(),
404 requests_will_be_sent: Default::default(),
405 extra_headers: Default::default(),
406 request_id_to_interception_id: Default::default(),
407 user_cache_disabled: false,
408 attempted_authentications: Default::default(),
409 credentials: None,
410 block_all: false,
411 user_request_interception_enabled: false,
412 protocol_request_interception_enabled: false,
413 offline: false,
414 request_timeout,
415 ignore_visuals: false,
416 block_javascript: false,
417 block_stylesheets: false,
418 block_prefetch: true,
419 block_analytics: true,
420 only_html: false,
421 xml_document: false,
422 intercept_manager: NetworkInterceptManager::Unknown,
423 document_reload_tracker: 0,
424 document_target_url: String::new(),
425 document_target_domain: String::new(),
426 whitelist_patterns: Vec::new(),
427 whitelist_matcher: None,
428 blacklist_patterns: Vec::new(),
429 blacklist_matcher: None,
430 blacklist_strict: true,
431 max_bytes_allowed: None,
432 #[cfg(feature = "_cache")]
433 cache_site_key: None,
434 #[cfg(feature = "_cache")]
435 cache_policy: None,
436 #[cfg(feature = "adblock")]
437 adblock_engine: None,
438 }
439 }
440
/// Installs a custom ad-block engine, replacing any previous one.
#[cfg(feature = "adblock")]
pub fn set_adblock_engine(&mut self, engine: std::sync::Arc<adblock::Engine>) {
    let wrapped = AdblockEngine(engine);
    self.adblock_engine = Some(wrapped);
}
446
447 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
449 where
450 I: IntoIterator<Item = S>,
451 S: Into<String>,
452 {
453 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
454 self.rebuild_whitelist_matcher();
455 }
456
457 pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
459 where
460 I: IntoIterator<Item = S>,
461 S: Into<String>,
462 {
463 self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
464 self.rebuild_blacklist_matcher();
465 }
466
467 pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
469 self.blacklist_patterns.push(pattern.into());
470 self.rebuild_blacklist_matcher();
471 }
472
473 pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
475 where
476 I: IntoIterator<Item = S>,
477 S: Into<String>,
478 {
479 self.blacklist_patterns
480 .extend(patterns.into_iter().map(Into::into));
481 self.rebuild_blacklist_matcher();
482 }
483
484 pub fn clear_blacklist(&mut self) {
486 self.blacklist_patterns.clear();
487 self.blacklist_matcher = None;
488 }
489
/// Sets whether a blacklist hit always blocks (`true`) or may be undone by later
/// allow-list/whitelist checks in `on_fetch_request_paused` (`false`).
pub fn set_blacklist_strict(&mut self, strict: bool) {
    self.blacklist_strict = strict;
}
494
495 #[inline]
496 fn rebuild_blacklist_matcher(&mut self) {
497 if self.blacklist_patterns.is_empty() {
498 self.blacklist_matcher = None;
499 return;
500 }
501
502 self.blacklist_matcher =
503 AhoCorasick::new(self.blacklist_patterns.iter().map(|s| s.as_str())).ok();
504 }
505
506 #[inline]
507 fn is_blacklisted(&self, url: &str) -> bool {
508 self.blacklist_matcher
509 .as_ref()
510 .map(|m| m.is_match(url))
511 .unwrap_or(false)
512 }
513
514 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
516 self.whitelist_patterns.push(pattern.into());
517 self.rebuild_whitelist_matcher();
518 }
519
520 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
522 where
523 I: IntoIterator<Item = S>,
524 S: Into<String>,
525 {
526 self.whitelist_patterns
527 .extend(patterns.into_iter().map(Into::into));
528 self.rebuild_whitelist_matcher();
529 }
530
531 #[inline]
532 fn rebuild_whitelist_matcher(&mut self) {
533 if self.whitelist_patterns.is_empty() {
534 self.whitelist_matcher = None;
535 return;
536 }
537
538 self.whitelist_matcher =
540 AhoCorasick::new(self.whitelist_patterns.iter().map(|s| s.as_str())).ok();
541 }
542
543 #[inline]
544 fn is_whitelisted(&self, url: &str) -> bool {
545 self.whitelist_matcher
546 .as_ref()
547 .map(|m| m.is_match(url))
548 .unwrap_or(false)
549 }
550
551 pub fn init_commands(&self) -> CommandChain {
553 let cmds = if self.ignore_httpserrors {
554 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
555 } else {
556 INIT_CHAIN.clone()
557 };
558 CommandChain::new(cmds, self.request_timeout)
559 }
560
561 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
563 let method = cmd.identifier();
564 if let Ok(params) = serde_json::to_value(cmd) {
565 self.queued_events
566 .push_back(NetworkEvent::SendCdpRequest((method, params)));
567 }
568 }
569
/// Removes and returns the oldest queued event, or `None` when the queue is empty.
pub fn poll(&mut self) -> Option<NetworkEvent> {
    self.queued_events.pop_front()
}
574
575 pub fn evict_stale_entries(&mut self, now: Instant) {
580 let cutoff = now - Duration::from_secs(STALE_BUFFER_SECS);
581
582 self.requests_will_be_sent.retain(|_, (_, ts)| *ts > cutoff);
583 self.request_id_to_interception_id
584 .retain(|_, (_, ts)| *ts > cutoff);
585
586 let request_cutoff = now - Duration::from_secs(STALE_REQUEST_SECS);
591 self.requests
592 .retain(|_, req| req.created_at > request_cutoff);
593
594 if !self.attempted_authentications.is_empty() {
599 let live: HashSet<&str> = self
600 .requests
601 .values()
602 .filter_map(|r| r.interception_id.as_ref().map(|id| id.as_ref()))
603 .collect();
604 self.attempted_authentications
605 .retain(|id| live.contains(id.as_ref()));
606 }
607 }
608
/// Returns the extra HTTP headers currently stored on the manager.
pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
    &self.extra_headers
}
613
614 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
616 self.extra_headers = headers;
617 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
618 self.extra_headers.remove("Proxy-Authorization");
619 if !self.extra_headers.is_empty() {
620 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
621 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
622 }
623 }
624 }
625
626 pub fn set_service_worker_enabled(&mut self, bypass: bool) {
627 self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
628 }
629
/// Sets whether every paused request is failed as blocked (see `on_fetch_request_paused`).
pub fn set_block_all(&mut self, block_all: bool) {
    self.block_all = block_all;
}
633
634 pub fn set_request_interception(&mut self, enabled: bool) {
635 self.user_request_interception_enabled = enabled;
636 self.update_protocol_request_interception();
637 }
638
639 pub fn set_cache_enabled(&mut self, enabled: bool) {
640 let run = self.user_cache_disabled == enabled;
641 self.user_cache_disabled = !enabled;
642 if run {
643 self.update_protocol_cache_disabled();
644 }
645 }
646
/// Marks protocol-level request interception as enabled. No command is queued.
pub fn enable_request_intercept(&mut self) {
    self.protocol_request_interception_enabled = true;
}
651
/// Marks protocol-level request interception as disabled. No command is queued.
pub fn disable_request_intercept(&mut self) {
    self.protocol_request_interception_enabled = false;
}
656
/// Sets the site key used to look up session-cache entries; `None` disables the lookup.
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    self.cache_site_key = cache_site_key;
}
662
/// Sets the policy that decides whether a session-cache entry may be served; `None` disables
/// the lookup.
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    self.cache_policy = cache_policy;
}
668
669 pub fn update_protocol_cache_disabled(&mut self) {
670 self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
671 }
672
673 pub fn authenticate(&mut self, credentials: Credentials) {
674 self.credentials = Some(credentials);
675 self.update_protocol_request_interception();
676 self.protocol_request_interception_enabled = true;
677 }
678
679 fn update_protocol_request_interception(&mut self) {
680 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
681
682 if enabled == self.protocol_request_interception_enabled {
683 return;
684 }
685
686 if enabled {
687 self.push_cdp_request(ENABLE_FETCH.clone())
688 } else {
689 self.push_cdp_request(DisableParams::default())
690 }
691 }
692
693 #[inline]
696 fn should_block_script_blocklist_only(&self, url: &str) -> bool {
697 let block_analytics = self.block_analytics;
699
700 if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
702 {
703 return true;
704 }
705
706 if crate::handler::blockers::block_websites::block_website(url) {
708 return true;
709 }
710
711 if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
718 let p_slash = Self::strip_query_fragment(path_with_slash);
720 let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
721
722 let base = match p_slash.rsplit('/').next() {
724 Some(b) => b,
725 None => p_slash,
726 };
727
728 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
731 return true;
732 }
733 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
734 return true;
735 }
736 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
737 return true;
738 }
739
740 if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
743 return true;
744 }
745
746 if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
748 return true;
749 }
750 }
751
752 false
753 }
754
755 #[inline]
760 fn url_path_with_leading_slash(url: &str) -> Option<&str> {
761 let bytes = url.as_bytes();
763 let idx = memchr::memmem::find(bytes, b"//")?;
764 let after_slashes = idx + 2;
765
766 let slash_rel = memchr::memchr(b'/', &bytes[after_slashes..])?;
768 let slash_idx = after_slashes + slash_rel;
769
770 if slash_idx < url.len() {
771 Some(&url[slash_idx..])
772 } else {
773 None
774 }
775 }
776
777 #[inline]
782 fn strip_query_fragment(s: &str) -> &str {
783 match memchr::memchr2(b'?', b'#', s.as_bytes()) {
784 Some(i) => &s[..i],
785 None => s,
786 }
787 }
788
789 #[inline]
791 fn skip_xhr(
792 &self,
793 skip_networking: bool,
794 event: &EventRequestPaused,
795 network_event: bool,
796 ) -> bool {
797 if !skip_networking && network_event {
799 let request_url = event.request.url.as_str();
800
801 let skip_analytics =
803 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
804
805 if skip_analytics {
806 true
807 } else if self.block_stylesheets || self.ignore_visuals {
808 let block_css = self.block_stylesheets;
809 let block_media = self.ignore_visuals;
810
811 let mut block_request = false;
812
813 if let Some(position) = memchr::memrchr(b'.', request_url.as_bytes()) {
814 let hlen = request_url.len();
815 let has_asset = hlen - position;
816
817 if has_asset >= 3 {
818 let next_position = position + 1;
819
820 if block_media
821 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
822 &request_url[next_position..].into(),
823 )
824 {
825 block_request = true;
826 } else if block_css {
827 block_request = CaseInsensitiveString::from(
828 &request_url.as_bytes()[next_position..],
829 )
830 .contains(&**CSS_EXTENSION)
831 }
832 }
833 }
834
835 if !block_request {
836 block_request = ignore_script_xhr_media(request_url);
837 }
838
839 block_request
840 } else {
841 skip_networking
842 }
843 } else {
844 skip_networking
845 }
846 }
847
/// Returns `true` when the request is already skipped, when the static ad list matches its
/// URL, or when the ad-block engine matches it (checked in that order).
#[cfg(feature = "adblock")]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    skip_networking || block_ads(&event.request.url) || self.detect_ad(event)
}
858
859 #[cfg(not(feature = "adblock"))]
861 #[inline]
862 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
863 use crate::handler::blockers::block_websites::block_ads;
864 if skip_networking {
865 true
866 } else {
867 block_ads(&event.request.url)
868 }
869 }
870
871 #[inline]
872 fn fail_request_blocked(
874 &mut self,
875 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
876 ) {
877 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
878 request_id.clone(),
879 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
880 );
881 self.push_cdp_request(params);
882 }
883
884 #[inline]
885 fn fulfill_request_empty_200(
887 &mut self,
888 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
889 ) {
890 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
891 request_id.clone(),
892 200,
893 );
894 self.push_cdp_request(params);
895 }
896
/// Queues a `Fetch.fulfillRequest` command that answers `request_id` with a cached response.
///
/// `body` is sent base64-encoded (standard alphabet) and every entry of `headers` becomes one
/// response header.
#[cfg(feature = "_cache")]
#[inline]
fn fulfill_request_from_cache(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    body: &[u8],
    headers: &std::collections::HashMap<String, String>,
    status: i64,
) {
    use crate::cdp::browser_protocol::fetch::HeaderEntry;
    use crate::handler::network::fetch::FulfillRequestParams;
    use base64::Engine;

    let resp_headers: Vec<HeaderEntry> = headers
        .iter()
        .map(|(name, value)| HeaderEntry {
            name: name.clone().into(),
            value: value.clone().into(),
        })
        .collect();

    let encoded_body = base64::engine::general_purpose::STANDARD.encode(body);

    let mut params = FulfillRequestParams::new(request_id.clone(), status);
    params.body = Some(encoded_body.into());
    params.response_headers = Some(resp_headers);

    self.push_cdp_request(params);
}
935
936 #[inline]
937 fn continue_request_with_url(
939 &mut self,
940 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
941 url: Option<&str>,
942 intercept_response: bool,
943 ) {
944 let mut params = ContinueRequestParams::new(request_id.clone());
945 if let Some(url) = url {
946 params.url = Some(url.to_string());
947 params.intercept_response = Some(intercept_response);
948 }
949 self.push_cdp_request(params);
950 }
951
952 #[inline]
954 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
955 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
956 return;
957 }
958
959 if self.block_all {
960 tracing::debug!(
961 "Blocked (block_all): {:?} - {}",
962 event.resource_type,
963 event.request.url
964 );
965 return self.fail_request_blocked(&event.request_id);
966 }
967
968 if let Some(network_id) = event.network_id.as_ref() {
969 if let Some((request_will_be_sent, _)) =
970 self.requests_will_be_sent.remove(network_id.as_ref())
971 {
972 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
973 } else {
974 self.request_id_to_interception_id.insert(
975 network_id.clone(),
976 (event.request_id.clone().into(), Instant::now()),
977 );
978 }
979 }
980
981 let javascript_resource = event.resource_type == ResourceType::Script;
983 let document_resource = event.resource_type == ResourceType::Document;
984 let network_resource =
985 !document_resource && crate::utils::is_data_resource(&event.resource_type);
986
987 let mut skip_networking =
989 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
990
991 if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
992 skip_networking = true;
993 }
994
995 if !skip_networking {
997 skip_networking = self.document_reload_tracker >= 3;
998 }
999
1000 let (current_url_cow, had_replacer) =
1002 self.handle_document_replacement_and_tracking(event, document_resource);
1003
1004 let current_url: &str = current_url_cow.as_ref();
1005
1006 let blacklisted = self.is_blacklisted(current_url);
1007
1008 if !self.blacklist_strict && blacklisted {
1009 skip_networking = true;
1010 }
1011
1012 if !skip_networking {
1013 if self.xml_document && current_url.ends_with(".xsl") {
1015 skip_networking = false;
1016 } else {
1017 skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
1018 }
1019 }
1020
1021 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
1022
1023 if !skip_networking
1025 && self.block_javascript
1026 && (self.only_html || self.ignore_visuals)
1027 && (javascript_resource
1028 || document_resource
1029 || event.resource_type == ResourceType::Stylesheet
1030 || event.resource_type == ResourceType::Image)
1031 {
1032 skip_networking = ignore_script_embedded(current_url);
1033 }
1034
1035 if !skip_networking && javascript_resource {
1038 skip_networking = self.should_block_script_blocklist_only(current_url);
1039 }
1040
1041 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
1043
1044 if !skip_networking && (javascript_resource || network_resource || document_resource) {
1046 skip_networking = self.intercept_manager.intercept_detection(
1047 current_url,
1048 self.ignore_visuals,
1049 network_resource,
1050 );
1051 }
1052
1053 if !skip_networking && (javascript_resource || network_resource) {
1055 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
1056 }
1057
1058 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
1061 {
1062 skip_networking = false;
1063 }
1064
1065 if skip_networking && self.is_whitelisted(current_url) {
1067 skip_networking = false;
1068 }
1069
1070 if self.blacklist_strict && blacklisted {
1071 skip_networking = true;
1072 }
1073
1074 if skip_networking {
1075 tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
1076 self.fulfill_request_empty_200(&event.request_id);
1077 } else {
1078 #[cfg(feature = "_cache")]
1079 {
1080 if let (Some(policy), Some(cache_site_key)) =
1081 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
1082 {
1083 let current_url = format!("{}:{}", event.request.method, ¤t_url);
1084
1085 if let Some((res, cache_policy)) =
1086 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
1087 {
1088 if policy.allows_cached(&cache_policy) {
1089 tracing::debug!(
1090 "Remote Cached: {:?} - {}",
1091 &event.resource_type,
1092 ¤t_url
1093 );
1094 let flat_headers = crate::http::headers_from_multi(&res.headers);
1095 return self.fulfill_request_from_cache(
1096 &event.request_id,
1097 &res.body,
1098 &flat_headers,
1099 res.status as i64,
1100 );
1101 }
1102 }
1103 }
1104 }
1105
1106 tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
1108 self.continue_request_with_url(
1109 &event.request_id,
1110 if had_replacer {
1111 Some(current_url)
1112 } else {
1113 None
1114 },
1115 !had_replacer,
1116 );
1117 }
1118 }
1119
1120 #[inline]
1126 fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1127 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1128 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1129 }
1130
1131 pub fn has_target_domain(&self) -> bool {
1133 !self.document_target_url.is_empty()
1134 }
1135
1136 pub fn set_page_url(&mut self, page_target_url: String) {
1138 let host_base = host_and_rest(&page_target_url)
1139 .map(|(h, _)| base_domain_from_host(h))
1140 .unwrap_or("");
1141
1142 self.document_target_domain = host_base.to_string();
1143 self.document_target_url = page_target_url;
1144 }
1145
1146 pub fn clear_target_domain(&mut self) {
1148 self.document_reload_tracker = 0;
1149 self.document_target_url = Default::default();
1150 self.document_target_domain = Default::default();
1151 }
1152
1153 #[inline]
1161 fn handle_document_replacement_and_tracking<'a>(
1162 &mut self,
1163 event: &'a EventRequestPaused,
1164 document_resource: bool,
1165 ) -> (Cow<'a, str>, bool) {
1166 let mut replacer: Option<String> = None;
1167 let current_url = event.request.url.as_str();
1168
1169 if document_resource {
1170 if self.document_target_url == current_url {
1171 self.document_reload_tracker += 1;
1172 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1173 {
1174 let (http_document_replacement, mut https_document_replacement) =
1175 if self.document_target_url.starts_with("http://") {
1176 (
1177 self.document_target_url.replacen("http://", "http//", 1),
1178 self.document_target_url.replacen("http://", "https://", 1),
1179 )
1180 } else {
1181 (
1182 self.document_target_url.replacen("https://", "https//", 1),
1183 self.document_target_url.replacen("https://", "http://", 1),
1184 )
1185 };
1186
1187 let trailing = https_document_replacement.ends_with('/');
1189 if trailing {
1190 https_document_replacement.pop();
1191 }
1192 if https_document_replacement.ends_with('/') {
1193 https_document_replacement.pop();
1194 }
1195
1196 let redirect_mask = format!(
1197 "{}{}",
1198 https_document_replacement, http_document_replacement
1199 );
1200
1201 if current_url == redirect_mask {
1202 replacer = Some(if trailing {
1203 format!("{}/", https_document_replacement)
1204 } else {
1205 https_document_replacement
1206 });
1207 }
1208 }
1209
1210 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1211 self.xml_document = true;
1212 }
1213
1214 self.document_target_url = event.request.url.clone();
1216 self.document_target_domain = host_and_rest(&self.document_target_url)
1217 .map(|(h, _)| base_domain_from_host(h).to_string())
1218 .unwrap_or_default();
1219 }
1220
1221 let current_url_cow = match replacer {
1222 Some(r) => Cow::Owned(r),
1223 None => Cow::Borrowed(event.request.url.as_str()),
1224 };
1225
1226 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1227 (current_url_cow, had_replacer)
1228 }
1229
/// Returns `true` when the ad-block engine matches the paused request.
///
/// Only `Script`, `Image`, `Media`, `Stylesheet`, `Document`, `Fetch` and `Xhr` resources are
/// checked; every other type returns `false`. The custom engine set on the manager is used if
/// present, otherwise a lazily built default engine.
#[cfg(feature = "adblock")]
pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
    use adblock::{
        lists::{FilterSet, ParseOptions, RuleTypes},
        Engine,
    };

    lazy_static::lazy_static! {
        static ref AD_ENGINE: Engine = {
            let mut filter_set = FilterSet::new(false);
            let mut rules = ParseOptions::default();
            rules.rule_types = RuleTypes::All;

            filter_set.add_filters(
                &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
                rules.clone(),
            );

            #[cfg(feature = "adblock_easylist")]
            {
                static EASYLIST: &str = include_str!(concat!(env!("OUT_DIR"), "/easylist.txt"));
                static EASYPRIVACY: &str = include_str!(concat!(env!("OUT_DIR"), "/easyprivacy.txt"));

                if !EASYLIST.is_empty() {
                    filter_set.add_filter_list(EASYLIST, rules.clone());
                }
                if !EASYPRIVACY.is_empty() {
                    filter_set.add_filter_list(EASYPRIVACY, rules);
                }
            }

            Engine::from_filter_set(filter_set, true)
        };
    }

    // Map the resource type to the engine's name for it; other types are never blocked here.
    let resource_type_str = match event.resource_type {
        ResourceType::Script => "script",
        ResourceType::Image => "image",
        ResourceType::Media => "media",
        ResourceType::Stylesheet => "stylesheet",
        ResourceType::Document => "document",
        ResourceType::Fetch => "fetch",
        ResourceType::Xhr => "xhr",
        _ => return false,
    };

    let u = &event.request.url;

    // A placeholder source domain is used until a document target is known.
    let source_domain = if self.document_target_domain.is_empty() {
        "example.com"
    } else {
        self.document_target_domain.as_str()
    };

    // Host of an http(s) URL: the authority up to the first '/', without userinfo
    // (text up to the last '@') and without the port. Other URLs use the source domain.
    let hostname = u
        .strip_prefix("https://")
        .or_else(|| u.strip_prefix("http://"))
        .and_then(|rest| {
            let authority = rest.split('/').next()?;
            let host_port = match memchr::memrchr(b'@', authority.as_bytes()) {
                Some(at) => &authority[at + 1..],
                None => authority,
            };
            host_port.split(':').next()
        })
        .unwrap_or(source_domain);

    let third_party = !event.request.is_same_site.unwrap_or_default();

    let request = adblock::request::Request::preparsed(
        u,
        hostname,
        source_domain,
        resource_type_str,
        third_party,
    );

    // Touch the default engine only when no custom engine is installed, so it is not built
    // needlessly.
    let engine: &Engine = match self.adblock_engine.as_ref() {
        Some(custom) => &**custom,
        None => &*AD_ENGINE,
    };

    engine.check_network_request(&request).matched
}
1333
1334 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1335 let response = if self
1336 .attempted_authentications
1337 .contains(event.request_id.as_ref())
1338 {
1339 AuthChallengeResponseResponse::CancelAuth
1340 } else if self.credentials.is_some() {
1341 self.attempted_authentications
1342 .insert(event.request_id.clone().into());
1343 AuthChallengeResponseResponse::ProvideCredentials
1344 } else {
1345 AuthChallengeResponseResponse::Default
1346 };
1347
1348 let mut auth = AuthChallengeResponse::new(response);
1349 if let Some(creds) = self.credentials.clone() {
1350 auth.username = Some(creds.username);
1351 auth.password = Some(creds.password);
1352 }
1353 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1354 }
1355
1356 pub fn set_offline_mode(&mut self, value: bool) {
1358 if self.offline == value {
1359 return;
1360 }
1361 self.offline = value;
1362 if let Ok(condition) = NetworkConditions::builder()
1363 .url_pattern("")
1364 .latency(0)
1365 .download_throughput(-1.)
1366 .upload_throughput(-1.)
1367 .build()
1368 {
1369 if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1370 .offline(self.offline)
1371 .matched_network_condition(condition)
1372 .build()
1373 {
1374 self.push_cdp_request(network);
1375 }
1376 }
1377 }
1378
1379 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1381 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1382 if let Some((interception_id, _)) = self
1383 .request_id_to_interception_id
1384 .remove(event.request_id.as_ref())
1385 {
1386 self.on_request(event, Some(interception_id));
1387 } else {
1388 self.requests_will_be_sent
1389 .insert(event.request_id.clone(), (event.clone(), Instant::now()));
1390 }
1391 } else {
1392 self.on_request(event, None);
1393 }
1394 }
1395
1396 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1398 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1399 request.from_memory_cache = true;
1400 }
1401 }
1402
1403 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1405 let mut request_failed = false;
1406
1407 let mut deducted: u64 = 0;
1409
1410 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1411 let before = *max_bytes;
1412
1413 let received_bytes: u64 = event.response.encoded_data_length as u64;
1415
1416 let content_length: Option<u64> = event
1418 .response
1419 .headers
1420 .inner()
1421 .get("content-length")
1422 .and_then(|v| v.as_str())
1423 .and_then(|s| s.trim().parse::<u64>().ok());
1424
1425 *max_bytes = max_bytes.saturating_sub(received_bytes);
1427
1428 if let Some(cl) = content_length {
1430 if cl > *max_bytes {
1431 *max_bytes = 0;
1432 }
1433 }
1434
1435 request_failed = *max_bytes == 0;
1436
1437 deducted = before.saturating_sub(*max_bytes);
1439 }
1440
1441 if deducted > 0 {
1443 self.queued_events
1444 .push_back(NetworkEvent::BytesConsumed(deducted));
1445 }
1446
1447 if request_failed && self.max_bytes_allowed.is_some() {
1449 self.set_block_all(true);
1450 }
1451
1452 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1453 request.set_response(event.response.clone());
1454 self.queued_events.push_back(if request_failed {
1455 NetworkEvent::RequestFailed(request)
1456 } else {
1457 NetworkEvent::RequestFinished(request)
1458 });
1459 }
1460 }
1461
1462 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1464 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1465 if let Some(interception_id) = request.interception_id.as_ref() {
1466 self.attempted_authentications
1467 .remove(interception_id.as_ref());
1468 }
1469 self.queued_events
1470 .push_back(NetworkEvent::RequestFinished(request));
1471 }
1472 }
1473
1474 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1476 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1477 request.failure_text = Some(event.error_text.clone());
1478 if let Some(interception_id) = request.interception_id.as_ref() {
1479 self.attempted_authentications
1480 .remove(interception_id.as_ref());
1481 }
1482 self.queued_events
1483 .push_back(NetworkEvent::RequestFailed(request));
1484 }
1485 }
1486
/// Register a request announced by `Network.requestWillBeSent`.
///
/// If the event carries a redirect response and the request id is already
/// tracked, the tracked request is finalized with that response and moved
/// into the redirect chain of the new request. A fresh `HttpRequest` is then
/// stored under the request id and a `NetworkEvent::Request` is queued.
///
/// `interception_id` is the paired Fetch interception id, if any.
fn on_request(
    &mut self,
    event: &EventRequestWillBeSent,
    interception_id: Option<InterceptionId>,
) {
    let mut redirect_chain = Vec::new();
    let mut redirect_location = None;

    if let Some(redirect_resp) = &event.redirect_response {
        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
            if is_redirect_status(redirect_resp.status) {
                // NOTE(review): the lookup uses the exact key "Location"; a header
                // reported with different casing would not be found here. Confirm
                // how header names arrive from the browser.
                if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
                    if redirect_resp.url != location {
                        // Strip every occurrence of the redirecting URL from the
                        // header value; presumably this reduces an absolute
                        // location to its remainder. TODO confirm intent.
                        let fixed_location = location.replace(&redirect_resp.url, "");

                        if !fixed_location.is_empty() {
                            // NOTE(review): `handle_request_redirect` below calls
                            // `set_response` on the same request; verify whether
                            // this write to the earlier response is still visible
                            // afterwards.
                            if let Some(resp) = request.response.as_mut() {
                                resp.headers.0["Location"] =
                                    serde_json::Value::String(fixed_location.clone());
                            }
                        }

                        redirect_location = Some(fixed_location);
                    }
                }
            }

            {
                // Work on a copy so the event's own response is left untouched.
                let mut redirect_resp = redirect_resp.clone();

                if let Some(redirect_location) = redirect_location {
                    if !redirect_location.is_empty() {
                        redirect_resp.headers.0["Location"] =
                            serde_json::Value::String(redirect_location);
                    }
                }

                self.handle_request_redirect(&mut request, redirect_resp);
            }

            // The new request inherits the previous chain plus the request
            // that was just redirected.
            redirect_chain = std::mem::take(&mut request.redirect_chain);
            redirect_chain.push(request);
        }
    }

    let request = HttpRequest::new(
        event.request_id.clone(),
        event.frame_id.clone(),
        interception_id,
        self.user_request_interception_enabled,
        redirect_chain,
    );

    let rid = event.request_id.clone();
    self.queued_events
        .push_back(NetworkEvent::Request(rid.clone()));
    self.requests.insert(rid, request);
}
1546
1547 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1549 request.set_response(response);
1550 if let Some(interception_id) = request.interception_id.as_ref() {
1551 self.attempted_authentications
1552 .remove(interception_id.as_ref());
1553 }
1554 }
1555}
1556
/// Events queued by the network manager for its owner to consume.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP command to send: the method identifier and its JSON payload.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A new request was registered under this request id.
    Request(RequestId),
    /// Carries a request id; presumably signals a received response. No
    /// producer is visible in this part of the file, so verify against callers.
    Response(RequestId),
    /// The request failed: loading failed, or its response arrived after the
    /// byte budget was exhausted.
    RequestFailed(HttpRequest),
    /// The request completed (response received or loading finished).
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the byte budget by one response.
    BytesConsumed(u64),
}
1572
1573#[cfg(test)]
1574mod tests {
1575 use super::ALLOWED_MATCHER_3RD_PARTY;
1576 use crate::handler::network::NetworkManager;
1577 use std::time::Duration;
1578
#[test]
fn test_allowed_matcher_3rd_party() {
    // Cloudflare challenge assets must pass the third-party allow-list.
    let challenge_url = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
    let challenge_allowed = ALLOWED_MATCHER_3RD_PARTY.is_match(challenge_url);
    assert!(
        challenge_allowed,
        "expected Cloudflare challenge script to be allowed"
    );

    // The Cloudflare Insights beacon is a tracker and must not be allow-listed.
    let insights_url = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
    let insights_allowed = ALLOWED_MATCHER_3RD_PARTY.is_match(insights_url);
    assert!(
        !insights_allowed,
        "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
    );

    // Well-known functional third-party scripts stay allowed.
    let stripe = "https://js.stripe.com/v3/";
    let recaptcha = "https://www.google.com/recaptcha/api.js?render=explicit";
    let jquery = "https://code.jquery.com/jquery-3.7.1.min.js";
    assert!(ALLOWED_MATCHER_3RD_PARTY.is_match(stripe));
    assert!(ALLOWED_MATCHER_3RD_PARTY.is_match(recaptcha));
    assert!(ALLOWED_MATCHER_3RD_PARTY.is_match(jquery));
}
1601
#[test]
fn test_script_allowed_by_default_when_not_blocklisted() {
    let page = "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string();
    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url(page);

    // An ordinary application bundle matches no blocklist entry.
    let bundle_url = "https://cdn.example.net/assets/some-app-bundle-12345.js";
    let blocked = manager.should_block_script_blocklist_only(bundle_url);
    assert!(!blocked, "expected non-blocklisted script to be allowed");
}
1616
#[test]
fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
    let page = "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string();
    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url(page);

    // A well-known analytics script name must be caught by the blocklist.
    let analytics_url = "https://cdn.example.net/js/analytics.js";
    let blocked = manager.should_block_script_blocklist_only(analytics_url);
    assert!(blocked, "expected analytics.js to be blocklisted");
}
1631
#[test]
fn test_allowed_matcher_3rd_party_sanity() {
    // NOTE(review): this repeats the assertions of `test_allowed_matcher_3rd_party`.
    let matcher = &ALLOWED_MATCHER_3RD_PARTY;

    let challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
    assert!(
        matcher.is_match(challenge),
        "expected Cloudflare challenge script to be allowed"
    );

    let beacon = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
    assert!(
        !matcher.is_match(beacon),
        "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
    );

    // Functional third-party scripts that must remain reachable.
    assert!(matcher.is_match("https://js.stripe.com/v3/"));
    assert!(matcher.is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
    assert!(matcher.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
}
#[test]
fn test_dynamic_blacklist_blocks_url() {
    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://example.com/".to_string());
    manager.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);

    // URLs containing a configured pattern are reported as blacklisted.
    let beacon = "https://static.cloudflareinsights.com/beacon.min.js";
    let gtm = "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX";
    assert!(manager.is_blacklisted(beacon));
    assert!(manager.is_blacklisted(gtm));

    // Unrelated URLs are not.
    let app = "https://cdn.example.net/assets/app.js";
    assert!(!manager.is_blacklisted(app));
}
1664
#[test]
fn test_blacklist_strict_wins_over_whitelist() {
    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://example.com/".to_string());

    // The same pattern is placed on both lists, with strict mode enabled.
    manager.set_blacklist_patterns(["beacon.min.js"]);
    manager.set_whitelist_patterns(["beacon.min.js"]);
    manager.set_blacklist_strict(true);

    let beacon = "https://static.cloudflareinsights.com/beacon.min.js";
    let whitelisted = manager.is_whitelisted(beacon);
    assert!(whitelisted);
    let blacklisted = manager.is_blacklisted(beacon);
    assert!(blacklisted);

    // Only the stored state is checked here, not the final block decision.
    assert!(manager.blacklist_strict);
}
1684
#[cfg(feature = "adblock")]
// Build a minimal `Fetch.requestPaused` event for a GET request to `url`
// with the given resource type and same-site flag. Every optional field
// other than `is_same_site` is left unset.
fn make_request_paused(
    url: &str,
    resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType,
    is_same_site: bool,
) -> chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused {
    use chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused;
    use chromiumoxide_cdp::cdp::browser_protocol::network::{
        Headers, Request, RequestId, RequestReferrerPolicy, ResourcePriority,
    };
    use chromiumoxide_cdp::cdp::browser_protocol::page::FrameId;

    let request = Request {
        url: url.to_string(),
        method: "GET".to_string(),
        headers: Headers::new(serde_json::Value::Object(Default::default())),
        initial_priority: ResourcePriority::Medium,
        referrer_policy: RequestReferrerPolicy::NoReferrer,
        url_fragment: None,
        has_post_data: None,
        post_data_entries: None,
        mixed_content_type: None,
        is_link_preload: None,
        trust_token_params: None,
        is_same_site: Some(is_same_site),
        is_ad_related: None,
    };

    EventRequestPaused {
        // The network request id is converted into the fetch-domain id type.
        request_id: RequestId::from("test-req".to_string()).into(),
        request,
        frame_id: FrameId::from("frame1".to_string()),
        resource_type,
        response_error_reason: None,
        response_status_code: None,
        response_status_text: None,
        response_headers: None,
        network_id: None,
        redirected_request_id: None,
    }
}
1728
#[cfg(feature = "adblock")]
#[test]
fn test_detect_ad_blocks_known_tracker_scripts() {
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://www.wine-searcher.com/".to_string());

    // A third-party Google Tag Manager script request.
    let tracker_url = "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX";
    let paused = make_request_paused(tracker_url, ResourceType::Script, false);

    let flagged = manager.detect_ad(&paused);
    assert!(
        flagged,
        "googletagmanager.com script should be detected as ad"
    );
}
1748
#[cfg(feature = "adblock")]
#[test]
fn test_detect_ad_allows_legitimate_scripts() {
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://www.mylegitsite-test.com/".to_string());

    // A same-site application bundle served by the page's own host.
    let bundle_url = "https://www.mylegitsite-test.com/static/js/app-bundle.js";
    let paused = make_request_paused(bundle_url, ResourceType::Script, true);

    let flagged = manager.detect_ad(&paused);
    assert!(
        !flagged,
        "legitimate first-party app bundle should not be blocked"
    );
}
1768
#[cfg(feature = "adblock")]
#[test]
fn test_detect_ad_uses_source_domain() {
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://www.wine-searcher.com/some-page".to_string());

    // Setting the page URL must populate the domain used as request source.
    let has_source_domain = !manager.document_target_domain.is_empty();
    assert!(
        has_source_domain,
        "document_target_domain should be set after set_page_url"
    );

    let analytics_url = "https://www.google-analytics.com/analytics.js";
    let paused = make_request_paused(analytics_url, ResourceType::Script, false);

    let flagged = manager.detect_ad(&paused);
    assert!(flagged, "google-analytics.com should be blocked as tracker");
}
1793
#[cfg(feature = "adblock")]
#[test]
fn test_custom_adblock_engine_takes_precedence() {
    use adblock::lists::{FilterSet, ParseOptions, RuleTypes};
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://example.com/".to_string());

    // Build an engine that knows exactly one custom rule and install it.
    let mut opts = ParseOptions::default();
    opts.rule_types = RuleTypes::All;
    let mut filters = FilterSet::new(false);
    filters.add_filters(["||custom-tracker.example.net^"], opts);
    let custom_engine = adblock::Engine::from_filter_set(filters, true);
    manager.set_adblock_engine(std::sync::Arc::new(custom_engine));

    let pixel_url = "https://custom-tracker.example.net/pixel.js";
    let paused = make_request_paused(pixel_url, ResourceType::Script, false);

    let flagged = manager.detect_ad(&paused);
    assert!(
        flagged,
        "custom engine rule should block custom-tracker.example.net"
    );
}
1821
#[cfg(feature = "adblock")]
// Feed one paused request through `on_fetch_request_paused` and report
// whether the manager answered with a fulfil or fail command, i.e. blocked
// it. Events queued before the call are discarded first, and every event
// produced by the call is drained.
fn run_full_interception(
    nm: &mut NetworkManager,
    url: &str,
    resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType,
    is_same_site: bool,
) -> bool {
    use super::NetworkEvent;

    // Throw away anything left over from set-up or earlier calls.
    while nm.poll().is_some() {}

    nm.on_fetch_request_paused(&make_request_paused(url, resource_type, is_same_site));

    // Drain the whole queue; any blocking command marks the request as blocked.
    std::iter::from_fn(|| nm.poll()).fold(false, |blocked, ev| {
        let is_block_command = if let NetworkEvent::SendCdpRequest((method, _)) = &ev {
            let name: &str = method.as_ref();
            name == "Fetch.fulfillRequest" || name == "Fetch.failRequest"
        } else {
            false
        };
        blocked || is_block_command
    })
}
1851
#[cfg(feature = "adblock")]
#[test]
fn test_e2e_tracker_script_blocked() {
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://www.wine-searcher.com/".to_string());

    // Third-party GTM script pushed through the whole interception path.
    let blocked = run_full_interception(
        &mut manager,
        "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
        ResourceType::Script,
        false,
    );

    assert!(blocked, "GTM script should be blocked through full pipeline");
}
1872
#[cfg(feature = "adblock")]
#[test]
fn test_e2e_legitimate_script_allowed() {
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://www.mylegitsite-test.com/".to_string());

    // Same-site bundle from the page's own host.
    let blocked = run_full_interception(
        &mut manager,
        "https://www.mylegitsite-test.com/static/js/app-bundle.js",
        ResourceType::Script,
        true,
    );

    assert!(
        !blocked,
        "legitimate first-party script should be allowed through full pipeline"
    );
}
1891
#[cfg(feature = "adblock")]
#[test]
fn test_e2e_analytics_xhr_blocked() {
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://example.org/".to_string());

    // Third-party analytics collection call issued as XHR.
    let blocked = run_full_interception(
        &mut manager,
        "https://www.google-analytics.com/g/collect?v=2&tid=UA-123",
        ResourceType::Xhr,
        false,
    );

    assert!(
        blocked,
        "Google Analytics XHR should be blocked through full pipeline"
    );
}
1910
#[cfg(feature = "adblock")]
#[test]
fn test_e2e_whitelisted_overrides_adblock() {
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://example.org/".to_string());
    // Explicitly allow a host the ad engine would otherwise block.
    manager.set_whitelist_patterns(["googletagmanager.com"]);

    let blocked = run_full_interception(
        &mut manager,
        "https://www.googletagmanager.com/gtm.js?id=GTM-TEST",
        ResourceType::Script,
        false,
    );

    assert!(
        !blocked,
        "whitelisted tracker should be allowed even when adblock would block it"
    );
}
1931
#[cfg(feature = "adblock")]
#[test]
fn test_e2e_blacklist_strict_overrides_whitelist() {
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://example.org/".to_string());

    // The same pattern sits on both lists; strict mode decides the conflict.
    manager.set_blacklist_patterns(["cdn.example.net/evil.js"]);
    manager.set_whitelist_patterns(["cdn.example.net/evil.js"]);
    manager.set_blacklist_strict(true);

    let blocked = run_full_interception(
        &mut manager,
        "https://cdn.example.net/evil.js",
        ResourceType::Script,
        false,
    );

    assert!(blocked, "strict blacklist should win over whitelist");
}
1953
#[cfg(feature = "adblock")]
#[test]
fn test_e2e_first_party_document_not_blocked() {
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://www.nytimes.com/".to_string());

    // Navigation to another page on the same site.
    let blocked = run_full_interception(
        &mut manager,
        "https://www.nytimes.com/2024/article.html",
        ResourceType::Document,
        true,
    );

    assert!(
        !blocked,
        "first-party document navigation should never be blocked"
    );
}
1972
#[cfg(feature = "adblock")]
#[test]
fn test_e2e_custom_engine_blocks_through_pipeline() {
    use adblock::lists::{FilterSet, ParseOptions, RuleTypes};
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://mysite.com/".to_string());

    // Install an engine holding a single script-only rule.
    let mut opts = ParseOptions::default();
    opts.rule_types = RuleTypes::All;
    let mut filters = FilterSet::new(false);
    filters.add_filters(["||evil-cdn.example.net^$script"], opts);
    let custom_engine = adblock::Engine::from_filter_set(filters, true);
    manager.set_adblock_engine(std::sync::Arc::new(custom_engine));

    let tracker_blocked = run_full_interception(
        &mut manager,
        "https://evil-cdn.example.net/tracker.js",
        ResourceType::Script,
        false,
    );
    assert!(
        tracker_blocked,
        "custom engine rule should block through full pipeline"
    );

    // The custom engine must not affect the site's own scripts.
    let own_script_blocked = run_full_interception(
        &mut manager,
        "https://mysite.com/app.js",
        ResourceType::Script,
        true,
    );
    assert!(
        !own_script_blocked,
        "first-party script should still be allowed with custom engine"
    );
}
2009
#[cfg(feature = "adblock")]
#[test]
fn test_e2e_ad_image_blocked() {
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://www.mylegitsite-test.com/".to_string());

    // Third-party conversion pixel requested as an image.
    let pixel_blocked = run_full_interception(
        &mut manager,
        "https://googleads.g.doubleclick.net/pagead/viewthroughconversion/123/?random=456",
        ResourceType::Image,
        false,
    );
    assert!(
        pixel_blocked,
        "doubleclick ad image/tracking pixel should be blocked"
    );

    // The site's own logo must still load.
    let logo_blocked = run_full_interception(
        &mut manager,
        "https://www.mylegitsite-test.com/images/logo.png",
        ResourceType::Image,
        true,
    );
    assert!(
        !logo_blocked,
        "legitimate first-party image should not be blocked"
    );
}
2040
#[cfg(feature = "adblock")]
#[test]
fn test_e2e_hostname_with_userinfo() {
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://example.org/".to_string());

    // A `user:pass@` prefix must not hide the tracker host from matching.
    let blocked = run_full_interception(
        &mut manager,
        "https://user:pass@www.googletagmanager.com/gtm.js?id=GTM-XXXX",
        ResourceType::Script,
        false,
    );

    assert!(blocked, "tracker URL with userinfo should still be blocked");
}
2060
#[test]
fn test_blacklist_non_strict_allows_whitelist_override() {
    let mut manager = NetworkManager::new(false, Duration::from_secs(30));
    manager.set_page_url("https://example.com/".to_string());

    // Same pattern on both lists, with strict mode switched off.
    manager.set_blacklist_patterns(["beacon.min.js"]);
    manager.set_whitelist_patterns(["beacon.min.js"]);
    manager.set_blacklist_strict(false);

    let beacon = "https://static.cloudflareinsights.com/beacon.min.js";
    let blacklisted = manager.is_blacklisted(beacon);
    assert!(blacklisted);
    let whitelisted = manager.is_whitelisted(beacon);
    assert!(whitelisted);

    // Only the stored state is checked here, not the final block decision.
    assert!(!manager.blacklist_strict);
}
2076}