1#[cfg(any(feature = "adblock", feature = "firewall"))]
2use super::blockers::block_websites::block_ads;
3use super::blockers::{
4 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
5 xhr::IGNORE_XHR_ASSETS,
6};
7use crate::auth::Credentials;
8#[cfg(feature = "_cache")]
9use crate::cache::BasicCachePolicy;
10use crate::cmd::CommandChain;
11use crate::handler::http::HttpRequest;
12use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
13use aho_corasick::AhoCorasick;
14use case_insensitive_string::CaseInsensitiveString;
15use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
16use chromiumoxide_cdp::cdp::browser_protocol::network::{
17 EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
18 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
19 InterceptionId, NetworkConditions, RequestId, ResourceType, Response, SetCacheDisabledParams,
20 SetExtraHttpHeadersParams,
21};
22use chromiumoxide_cdp::cdp::browser_protocol::{
23 fetch::{
24 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
25 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
26 },
27 network::SetBypassServiceWorkerParams,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::{
30 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
31};
32use chromiumoxide_types::{Command, Method, MethodId};
33use hashbrown::{HashMap, HashSet};
34use lazy_static::lazy_static;
35use reqwest::header::PROXY_AUTHORIZATION;
36use spider_network_blocker::intercept_manager::NetworkInterceptManager;
37pub use spider_network_blocker::scripts::{
38 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
39};
40use std::borrow::Cow;
41use std::collections::VecDeque;
42use std::time::{Duration, Instant};
43
lazy_static! {
    /// Substring patterns compiled into [`ALLOWED_MATCHER`]: generic
    /// framework/bundle name fragments followed by captcha, challenge,
    /// auth and payment script URL prefixes.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery", "angular",
        "react", "vue", "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app", "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/", "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/",
        "https://google.com/recaptcha/api.js",
        "https://www.gstatic.com/recaptcha/",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://api.leminnow.com/captcha/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://client-api.arkoselabs.com/",
        "https://www.capy.me/puzzle/",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
    ];

    /// Aho-Corasick matcher over [`JS_FRAMEWORK_ALLOW`]. Building it panics
    /// (via `expect`) on first access if the automaton cannot be constructed.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// Substring patterns compiled into [`ALLOWED_MATCHER_3RD_PARTY`].
    // NOTE(review): "https://captcha.px-cloud.net/" is listed twice below; the
    // duplicate does not change `is_match` results.
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.capy.me/puzzle/",
        "https://www.gstatic.com/recaptcha/",
        "https://google.com/recaptcha/",
        "https://www.google.com/recaptcha/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://newassets.hcaptcha.com/",
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://api.leminnow.com/captcha/",
        "https://client-api.arkoselabs.com/",
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "discourse-cdn.com/",
        "hcaptcha.com",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];

    /// Aho-Corasick matcher over [`JS_FRAMEWORK_ALLOW_3RD_PARTY`].
    /// `NetworkManager::on_fetch_request_paused` uses it to re-allow a script
    /// request that an earlier rule had marked as skipped.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path fragments associated with JS framework output directories.
    // NOTE(review): not referenced in the visible part of this file; confirm usage.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            "_astro/", "_app/immutable"
        }
    };

    /// MIME types for binary, archive, image, audio/video and office payloads.
    // NOTE(review): presumably used to ignore such response bodies; the consumer
    // is not visible in this part of the file.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names skipped when `NetworkManager::ignore_visuals` is set
    /// (looked up with `ResourceType::as_ref()`).
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names that `NetworkManager::on_fetch_request_paused`
    /// always marks as skipped.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Ping",
    };

    /// Case-insensitive `css` extension used by the XHR stylesheet check.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// Initial command list containing only the network-domain `EnableParams`
    /// command. Empty if the params fail to serialize.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// Same as [`INIT_CHAIN`] plus `SetIgnoreCertificateErrorsParams::new(true)`.
    /// Each command is included only if it serializes successfully.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// Fetch-domain enable command: handles auth requests and pauses every URL
    /// (`"*"`) at the request stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
            .handle_auth_requests(true)
            .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
            .build()
    };
}
229
/// Returns `true` when `status` is one of the HTTP redirect codes
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
234
/// Age in seconds after which `NetworkManager::evict_stale_entries` drops
/// buffered `requests_will_be_sent` / `request_id_to_interception_id` entries.
const STALE_BUFFER_SECS: u64 = 30;
240
#[cfg(feature = "adblock")]
/// Newtype around a shared (`Arc`) `adblock::Engine`.
/// Only compiled with the `adblock` feature.
pub struct AdblockEngine(std::sync::Arc<adblock::Engine>);
244
#[cfg(feature = "adblock")]
impl std::fmt::Debug for AdblockEngine {
    /// Prints just the type name; the wrapped engine is not rendered.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("AdblockEngine")
    }
}
251
#[cfg(feature = "adblock")]
impl std::ops::Deref for AdblockEngine {
    type Target = adblock::Engine;

    /// Borrows the engine stored behind the inner `Arc`.
    fn deref(&self) -> &Self::Target {
        let Self(shared) = self;
        &**shared
    }
}
259
/// Tracks network/fetch state for a page and queues the CDP commands that
/// should be sent in response to network events.
#[derive(Debug)]
pub struct NetworkManager {
    /// Outgoing events, handed out front-first by `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// When set, `init_commands` also sends `SetIgnoreCertificateErrorsParams::new(true)`.
    ignore_httpserrors: bool,
    /// Requests keyed by network request id.
    requests: HashMap<RequestId, HttpRequest>,
    /// `requestWillBeSent` events awaiting their paused-fetch counterpart, with
    /// the insertion time consulted by `evict_stale_entries`.
    requests_will_be_sent: HashMap<RequestId, (EventRequestWillBeSent, Instant)>,
    /// Extra HTTP headers sent to the browser by `set_extra_headers`.
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception ids of paused requests that had no buffered
    /// `requestWillBeSent` event yet, with the insertion time.
    request_id_to_interception_id: HashMap<RequestId, (InterceptionId, Instant)>,
    /// Whether the caller asked for the browser cache to be disabled.
    user_cache_disabled: bool,
    /// Request ids for which credentials have already been offered once.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials supplied through `authenticate`, if any.
    credentials: Option<Credentials>,
    /// Set by `set_request_interception`. When this and
    /// `protocol_request_interception_enabled` are both true,
    /// `on_fetch_request_paused` returns without queuing a command.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed with `BlockedByClient`.
    block_all: bool,
    /// Flag toggled by `enable_request_intercept`, `disable_request_intercept`
    /// and `authenticate`.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Last requested offline mode.
    // NOTE(review): the setter is only partially visible here; confirm semantics.
    offline: bool,
    /// Timeout passed to the `CommandChain` built by `init_commands`.
    pub request_timeout: Duration,
    /// Skip resource types listed in `IGNORE_VISUAL_RESOURCE_MAP`.
    pub ignore_visuals: bool,
    /// Skip `Stylesheet` resources.
    pub block_stylesheets: bool,
    /// Together with `only_html` or `ignore_visuals`, enables the
    /// `ignore_script_embedded` URL check.
    pub block_javascript: bool,
    /// Enables the analytics-related URL/XHR blocklists.
    pub block_analytics: bool,
    /// Controls how `Prefetch` resources are treated in `on_fetch_request_paused`.
    pub block_prefetch: bool,
    /// One of the switches (with `block_javascript`) gating the embedded-script check.
    pub only_html: bool,
    /// Set when the first document request URL ends with `.xml`; `.xsl`
    /// requests then bypass the visual/stylesheet skip.
    pub xml_document: bool,
    /// Site-specific interception rules consulted via `intercept_detection`.
    pub intercept_manager: NetworkInterceptManager,
    /// Count of document requests matching the current target URL; requests
    /// are skipped once it reaches 3.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request (or the value given to `set_page_url`).
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`.
    pub document_target_domain: String,
    /// Optional byte budget.
    // NOTE(review): enforcement is not visible in this part of the file.
    pub max_bytes_allowed: Option<u64>,
    #[cfg(feature = "_cache")]
    /// Session key used for remote cache lookups.
    pub cache_site_key: Option<String>,
    #[cfg(feature = "_cache")]
    /// Policy deciding whether a cached item may be served.
    pub cache_policy: Option<BasicCachePolicy>,
    /// Raw whitelist substrings; source for `whitelist_matcher`.
    whitelist_patterns: Vec<String>,
    /// Matcher built from `whitelist_patterns`; `None` when the list is empty
    /// or the build failed.
    whitelist_matcher: Option<AhoCorasick>,
    /// Raw blacklist substrings; source for `blacklist_matcher`.
    blacklist_patterns: Vec<String>,
    /// Matcher built from `blacklist_patterns`; `None` when the list is empty
    /// or the build failed.
    blacklist_matcher: Option<AhoCorasick>,
    /// When true a blacklist hit is applied last and cannot be overridden by
    /// allow rules; when false it is applied early and may be re-allowed.
    blacklist_strict: bool,
    #[cfg(feature = "adblock")]
    /// Caller-supplied adblock engine; `detect_ad` falls back to a built-in
    /// engine when this is `None`.
    adblock_engine: Option<AdblockEngine>,
}
389
390impl NetworkManager {
391 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
393 Self {
394 queued_events: Default::default(),
395 ignore_httpserrors,
396 requests: Default::default(),
397 requests_will_be_sent: Default::default(),
398 extra_headers: Default::default(),
399 request_id_to_interception_id: Default::default(),
400 user_cache_disabled: false,
401 attempted_authentications: Default::default(),
402 credentials: None,
403 block_all: false,
404 user_request_interception_enabled: false,
405 protocol_request_interception_enabled: false,
406 offline: false,
407 request_timeout,
408 ignore_visuals: false,
409 block_javascript: false,
410 block_stylesheets: false,
411 block_prefetch: true,
412 block_analytics: true,
413 only_html: false,
414 xml_document: false,
415 intercept_manager: NetworkInterceptManager::Unknown,
416 document_reload_tracker: 0,
417 document_target_url: String::new(),
418 document_target_domain: String::new(),
419 whitelist_patterns: Vec::new(),
420 whitelist_matcher: None,
421 blacklist_patterns: Vec::new(),
422 blacklist_matcher: None,
423 blacklist_strict: true,
424 max_bytes_allowed: None,
425 #[cfg(feature = "_cache")]
426 cache_site_key: None,
427 #[cfg(feature = "_cache")]
428 cache_policy: None,
429 #[cfg(feature = "adblock")]
430 adblock_engine: None,
431 }
432 }
433
    #[cfg(feature = "adblock")]
    /// Installs a caller-supplied adblock engine; `detect_ad` then uses it
    /// instead of the built-in one.
    pub fn set_adblock_engine(&mut self, engine: std::sync::Arc<adblock::Engine>) {
        self.adblock_engine = Some(AdblockEngine(engine));
    }
439
440 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
442 where
443 I: IntoIterator<Item = S>,
444 S: Into<String>,
445 {
446 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
447 self.rebuild_whitelist_matcher();
448 }
449
450 pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
452 where
453 I: IntoIterator<Item = S>,
454 S: Into<String>,
455 {
456 self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
457 self.rebuild_blacklist_matcher();
458 }
459
460 pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
462 self.blacklist_patterns.push(pattern.into());
463 self.rebuild_blacklist_matcher();
464 }
465
466 pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
468 where
469 I: IntoIterator<Item = S>,
470 S: Into<String>,
471 {
472 self.blacklist_patterns
473 .extend(patterns.into_iter().map(Into::into));
474 self.rebuild_blacklist_matcher();
475 }
476
    /// Removes every blacklist pattern and drops the compiled matcher, so
    /// `is_blacklisted` returns `false` for all URLs afterwards.
    pub fn clear_blacklist(&mut self) {
        self.blacklist_patterns.clear();
        self.blacklist_matcher = None;
    }
482
    /// Sets strict blacklist mode. In strict mode `on_fetch_request_paused`
    /// applies a blacklist hit after all allow rules; otherwise it is applied
    /// before them and can still be re-allowed.
    pub fn set_blacklist_strict(&mut self, strict: bool) {
        self.blacklist_strict = strict;
    }
487
488 #[inline]
489 fn rebuild_blacklist_matcher(&mut self) {
490 if self.blacklist_patterns.is_empty() {
491 self.blacklist_matcher = None;
492 return;
493 }
494
495 self.blacklist_matcher =
496 AhoCorasick::new(self.blacklist_patterns.iter().map(|s| s.as_str())).ok();
497 }
498
499 #[inline]
500 fn is_blacklisted(&self, url: &str) -> bool {
501 self.blacklist_matcher
502 .as_ref()
503 .map(|m| m.is_match(url))
504 .unwrap_or(false)
505 }
506
507 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
509 self.whitelist_patterns.push(pattern.into());
510 self.rebuild_whitelist_matcher();
511 }
512
513 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
515 where
516 I: IntoIterator<Item = S>,
517 S: Into<String>,
518 {
519 self.whitelist_patterns
520 .extend(patterns.into_iter().map(Into::into));
521 self.rebuild_whitelist_matcher();
522 }
523
524 #[inline]
525 fn rebuild_whitelist_matcher(&mut self) {
526 if self.whitelist_patterns.is_empty() {
527 self.whitelist_matcher = None;
528 return;
529 }
530
531 self.whitelist_matcher =
533 AhoCorasick::new(self.whitelist_patterns.iter().map(|s| s.as_str())).ok();
534 }
535
536 #[inline]
537 fn is_whitelisted(&self, url: &str) -> bool {
538 self.whitelist_matcher
539 .as_ref()
540 .map(|m| m.is_match(url))
541 .unwrap_or(false)
542 }
543
544 pub fn init_commands(&self) -> CommandChain {
546 let cmds = if self.ignore_httpserrors {
547 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
548 } else {
549 INIT_CHAIN.clone()
550 };
551 CommandChain::new(cmds, self.request_timeout)
552 }
553
554 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
556 let method = cmd.identifier();
557 if let Ok(params) = serde_json::to_value(cmd) {
558 self.queued_events
559 .push_back(NetworkEvent::SendCdpRequest((method, params)));
560 }
561 }
562
    /// Removes and returns the oldest queued event, or `None` when the queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
567
568 pub fn evict_stale_entries(&mut self) {
573 let cutoff = Instant::now() - Duration::from_secs(STALE_BUFFER_SECS);
574
575 self.requests_will_be_sent.retain(|_, (_, ts)| *ts > cutoff);
576 self.request_id_to_interception_id
577 .retain(|_, (_, ts)| *ts > cutoff);
578
579 if !self.attempted_authentications.is_empty() {
584 let live: HashSet<&str> = self
585 .requests
586 .values()
587 .filter_map(|r| r.interception_id.as_ref().map(|id| id.as_ref()))
588 .collect();
589 self.attempted_authentications
590 .retain(|id| live.contains(id.as_ref()));
591 }
592 }
593
    /// Returns the extra HTTP headers currently stored on the manager.
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
598
599 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
601 self.extra_headers = headers;
602 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
603 self.extra_headers.remove("Proxy-Authorization");
604 if !self.extra_headers.is_empty() {
605 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
606 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
607 }
608 }
609 }
610
    /// Queues a `SetBypassServiceWorkerParams` command with the given flag.
    ///
    /// Note that despite the method name, `bypass` is forwarded unchanged:
    /// passing `true` asks the browser to bypass service workers.
    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
    }
614
    /// Sets whether every paused request is failed with `BlockedByClient`.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
618
    /// Records the caller's request-interception preference and queues the
    /// Fetch enable/disable command if the desired state differs from the
    /// protocol flag.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        self.update_protocol_request_interception();
    }
623
624 pub fn set_cache_enabled(&mut self, enabled: bool) {
625 let run = self.user_cache_disabled == enabled;
626 self.user_cache_disabled = !enabled;
627 if run {
628 self.update_protocol_cache_disabled();
629 }
630 }
631
    /// Marks protocol-level request interception as enabled.
    /// Only the flag is updated; no CDP command is queued.
    pub fn enable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
636
    /// Marks protocol-level request interception as disabled.
    /// Only the flag is updated; no CDP command is queued.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = false;
    }
641
    #[cfg(feature = "_cache")]
    /// Sets (or clears) the session key used for remote cache lookups.
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
647
    #[cfg(feature = "_cache")]
    /// Sets (or clears) the policy that decides whether a cached item may be served.
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
653
    /// Queues a `SetCacheDisabledParams` command carrying the current
    /// `user_cache_disabled` value.
    pub fn update_protocol_cache_disabled(&mut self) {
        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
    }
657
    /// Stores `credentials` for answering auth challenges and turns request
    /// interception on.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        // Must run before the flag is set below: the Fetch enable command is
        // only queued while the protocol flag is still false.
        self.update_protocol_request_interception();
        self.protocol_request_interception_enabled = true;
    }
663
664 fn update_protocol_request_interception(&mut self) {
665 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
666
667 if enabled == self.protocol_request_interception_enabled {
668 return;
669 }
670
671 if enabled {
672 self.push_cdp_request(ENABLE_FETCH.clone())
673 } else {
674 self.push_cdp_request(DisableParams::default())
675 }
676 }
677
678 #[inline]
681 fn should_block_script_blocklist_only(&self, url: &str) -> bool {
682 let block_analytics = self.block_analytics;
684
685 if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
687 {
688 return true;
689 }
690
691 if crate::handler::blockers::block_websites::block_website(url) {
693 return true;
694 }
695
696 if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
703 let p_slash = Self::strip_query_fragment(path_with_slash);
705 let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
706
707 let base = match p_slash.rsplit('/').next() {
709 Some(b) => b,
710 None => p_slash,
711 };
712
713 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
716 return true;
717 }
718 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
719 return true;
720 }
721 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
722 return true;
723 }
724
725 if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
728 return true;
729 }
730
731 if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
733 return true;
734 }
735 }
736
737 false
738 }
739
740 #[inline]
745 fn url_path_with_leading_slash(url: &str) -> Option<&str> {
746 let idx = url.find("//")?;
748 let after_slashes = idx + 2;
749
750 let slash_rel = url[after_slashes..].find('/')?;
752 let slash_idx = after_slashes + slash_rel;
753
754 if slash_idx < url.len() {
755 Some(&url[slash_idx..])
756 } else {
757 None
758 }
759 }
760
761 #[inline]
766 fn strip_query_fragment(s: &str) -> &str {
767 let q = s.find('?');
768 let h = s.find('#');
769
770 match (q, h) {
771 (None, None) => s,
772 (Some(i), None) => &s[..i],
773 (None, Some(i)) => &s[..i],
774 (Some(i), Some(j)) => &s[..i.min(j)],
775 }
776 }
777
778 #[inline]
780 fn skip_xhr(
781 &self,
782 skip_networking: bool,
783 event: &EventRequestPaused,
784 network_event: bool,
785 ) -> bool {
786 if !skip_networking && network_event {
788 let request_url = event.request.url.as_str();
789
790 let skip_analytics =
792 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
793
794 if skip_analytics {
795 true
796 } else if self.block_stylesheets || self.ignore_visuals {
797 let block_css = self.block_stylesheets;
798 let block_media = self.ignore_visuals;
799
800 let mut block_request = false;
801
802 if let Some(position) = request_url.rfind('.') {
803 let hlen = request_url.len();
804 let has_asset = hlen - position;
805
806 if has_asset >= 3 {
807 let next_position = position + 1;
808
809 if block_media
810 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
811 &request_url[next_position..].into(),
812 )
813 {
814 block_request = true;
815 } else if block_css {
816 block_request = CaseInsensitiveString::from(
817 &request_url.as_bytes()[next_position..],
818 )
819 .contains(&**CSS_EXTENSION)
820 }
821 }
822 }
823
824 if !block_request {
825 block_request = ignore_script_xhr_media(request_url);
826 }
827
828 block_request
829 } else {
830 skip_networking
831 }
832 } else {
833 skip_networking
834 }
835 }
836
837 #[cfg(feature = "adblock")]
838 #[inline]
839 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
841 if skip_networking {
842 true
843 } else {
844 block_ads(&event.request.url) || self.detect_ad(event)
845 }
846 }
847
848 #[cfg(not(feature = "adblock"))]
850 #[inline]
851 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
852 use crate::handler::blockers::block_websites::block_ads;
853 if skip_networking {
854 true
855 } else {
856 block_ads(&event.request.url)
857 }
858 }
859
860 #[inline]
861 fn fail_request_blocked(
863 &mut self,
864 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
865 ) {
866 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
867 request_id.clone(),
868 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
869 );
870 self.push_cdp_request(params);
871 }
872
873 #[inline]
874 fn fulfill_request_empty_200(
876 &mut self,
877 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
878 ) {
879 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
880 request_id.clone(),
881 200,
882 );
883 self.push_cdp_request(params);
884 }
885
886 #[cfg(feature = "_cache")]
887 #[inline]
888 fn fulfill_request_from_cache(
892 &mut self,
893 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
894 body: &[u8],
895 headers: &std::collections::HashMap<String, String>,
896 status: i64,
897 ) {
898 use crate::cdp::browser_protocol::fetch::HeaderEntry;
899 use crate::handler::network::fetch::FulfillRequestParams;
900 use base64::Engine;
901
902 let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
903
904 for (k, v) in headers.iter() {
905 resp_headers.push(HeaderEntry {
906 name: k.clone().into(),
907 value: v.clone().into(),
908 });
909 }
910
911 let mut params = FulfillRequestParams::new(request_id.clone(), status);
912
913 params.body = Some(
915 base64::engine::general_purpose::STANDARD
916 .encode(body)
917 .into(),
918 );
919
920 params.response_headers = Some(resp_headers);
921
922 self.push_cdp_request(params);
923 }
924
925 #[inline]
926 fn continue_request_with_url(
928 &mut self,
929 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
930 url: Option<&str>,
931 intercept_response: bool,
932 ) {
933 let mut params = ContinueRequestParams::new(request_id.clone());
934 if let Some(url) = url {
935 params.url = Some(url.to_string());
936 params.intercept_response = Some(intercept_response);
937 }
938 self.push_cdp_request(params);
939 }
940
941 #[inline]
943 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
944 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
945 return;
946 }
947
948 if self.block_all {
949 tracing::debug!(
950 "Blocked (block_all): {:?} - {}",
951 event.resource_type,
952 event.request.url
953 );
954 return self.fail_request_blocked(&event.request_id);
955 }
956
957 if let Some(network_id) = event.network_id.as_ref() {
958 if let Some((request_will_be_sent, _)) =
959 self.requests_will_be_sent.remove(network_id.as_ref())
960 {
961 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
962 } else {
963 self.request_id_to_interception_id.insert(
964 network_id.clone(),
965 (event.request_id.clone().into(), Instant::now()),
966 );
967 }
968 }
969
970 let javascript_resource = event.resource_type == ResourceType::Script;
972 let document_resource = event.resource_type == ResourceType::Document;
973 let network_resource =
974 !document_resource && crate::utils::is_data_resource(&event.resource_type);
975
976 let mut skip_networking =
978 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
979
980 if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
981 skip_networking = true;
982 }
983
984 if !skip_networking {
986 skip_networking = self.document_reload_tracker >= 3;
987 }
988
989 let (current_url_cow, had_replacer) =
991 self.handle_document_replacement_and_tracking(event, document_resource);
992
993 let current_url: &str = current_url_cow.as_ref();
994
995 let blacklisted = self.is_blacklisted(current_url);
996
997 if !self.blacklist_strict && blacklisted {
998 skip_networking = true;
999 }
1000
1001 if !skip_networking {
1002 if self.xml_document && current_url.ends_with(".xsl") {
1004 skip_networking = false;
1005 } else {
1006 skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
1007 }
1008 }
1009
1010 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
1011
1012 if !skip_networking
1014 && self.block_javascript
1015 && (self.only_html || self.ignore_visuals)
1016 && (javascript_resource
1017 || document_resource
1018 || event.resource_type == ResourceType::Stylesheet
1019 || event.resource_type == ResourceType::Image)
1020 {
1021 skip_networking = ignore_script_embedded(current_url);
1022 }
1023
1024 if !skip_networking && javascript_resource {
1027 skip_networking = self.should_block_script_blocklist_only(current_url);
1028 }
1029
1030 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
1032
1033 if !skip_networking && (javascript_resource || network_resource || document_resource) {
1035 skip_networking = self.intercept_manager.intercept_detection(
1036 current_url,
1037 self.ignore_visuals,
1038 network_resource,
1039 );
1040 }
1041
1042 if !skip_networking && (javascript_resource || network_resource) {
1044 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
1045 }
1046
1047 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
1050 {
1051 skip_networking = false;
1052 }
1053
1054 if skip_networking && self.is_whitelisted(current_url) {
1056 skip_networking = false;
1057 }
1058
1059 if self.blacklist_strict && blacklisted {
1060 skip_networking = true;
1061 }
1062
1063 if skip_networking {
1064 tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
1065 self.fulfill_request_empty_200(&event.request_id);
1066 } else {
1067 #[cfg(feature = "_cache")]
1068 {
1069 if let (Some(policy), Some(cache_site_key)) =
1070 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
1071 {
1072 let current_url = format!("{}:{}", event.request.method, ¤t_url);
1073
1074 if let Some((res, cache_policy)) =
1075 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
1076 {
1077 if policy.allows_cached(&cache_policy) {
1078 tracing::debug!(
1079 "Remote Cached: {:?} - {}",
1080 &event.resource_type,
1081 ¤t_url
1082 );
1083 let flat_headers = crate::http::headers_from_multi(&res.headers);
1084 return self.fulfill_request_from_cache(
1085 &event.request_id,
1086 &res.body,
1087 &flat_headers,
1088 res.status as i64,
1089 );
1090 }
1091 }
1092 }
1093 }
1094
1095 tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
1097 self.continue_request_with_url(
1098 &event.request_id,
1099 if had_replacer {
1100 Some(current_url)
1101 } else {
1102 None
1103 },
1104 !had_replacer,
1105 );
1106 }
1107 }
1108
1109 #[inline]
1115 fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1116 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1117 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1118 }
1119
    /// Returns `true` when a target document URL is set.
    /// Note that it checks `document_target_url`, not `document_target_domain`.
    pub fn has_target_domain(&self) -> bool {
        !self.document_target_url.is_empty()
    }
1124
1125 pub fn set_page_url(&mut self, page_target_url: String) {
1127 let host_base = host_and_rest(&page_target_url)
1128 .map(|(h, _)| base_domain_from_host(h))
1129 .unwrap_or("");
1130
1131 self.document_target_domain = host_base.to_string();
1132 self.document_target_url = page_target_url;
1133 }
1134
1135 pub fn clear_target_domain(&mut self) {
1137 self.document_reload_tracker = 0;
1138 self.document_target_url = Default::default();
1139 self.document_target_domain = Default::default();
1140 }
1141
1142 #[inline]
1150 fn handle_document_replacement_and_tracking<'a>(
1151 &mut self,
1152 event: &'a EventRequestPaused,
1153 document_resource: bool,
1154 ) -> (Cow<'a, str>, bool) {
1155 let mut replacer: Option<String> = None;
1156 let current_url = event.request.url.as_str();
1157
1158 if document_resource {
1159 if self.document_target_url == current_url {
1160 self.document_reload_tracker += 1;
1161 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1162 {
1163 let (http_document_replacement, mut https_document_replacement) =
1164 if self.document_target_url.starts_with("http://") {
1165 (
1166 self.document_target_url.replacen("http://", "http//", 1),
1167 self.document_target_url.replacen("http://", "https://", 1),
1168 )
1169 } else {
1170 (
1171 self.document_target_url.replacen("https://", "https//", 1),
1172 self.document_target_url.replacen("https://", "http://", 1),
1173 )
1174 };
1175
1176 let trailing = https_document_replacement.ends_with('/');
1178 if trailing {
1179 https_document_replacement.pop();
1180 }
1181 if https_document_replacement.ends_with('/') {
1182 https_document_replacement.pop();
1183 }
1184
1185 let redirect_mask = format!(
1186 "{}{}",
1187 https_document_replacement, http_document_replacement
1188 );
1189
1190 if current_url == redirect_mask {
1191 replacer = Some(if trailing {
1192 format!("{}/", https_document_replacement)
1193 } else {
1194 https_document_replacement
1195 });
1196 }
1197 }
1198
1199 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1200 self.xml_document = true;
1201 }
1202
1203 self.document_target_url = event.request.url.clone();
1205 self.document_target_domain = host_and_rest(&self.document_target_url)
1206 .map(|(h, _)| base_domain_from_host(h).to_string())
1207 .unwrap_or_default();
1208 }
1209
1210 let current_url_cow = match replacer {
1211 Some(r) => Cow::Owned(r),
1212 None => Cow::Borrowed(event.request.url.as_str()),
1213 };
1214
1215 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1216 (current_url_cow, had_replacer)
1217 }
1218
1219 #[cfg(feature = "adblock")]
1223 pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1224 use adblock::{
1225 lists::{FilterSet, ParseOptions, RuleTypes},
1226 Engine,
1227 };
1228
1229 lazy_static::lazy_static! {
1230 static ref AD_ENGINE: Engine = {
1231 let mut filter_set = FilterSet::new(false);
1232 let mut rules = ParseOptions::default();
1233 rules.rule_types = RuleTypes::All;
1234
1235 filter_set.add_filters(
1236 &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1237 rules.clone(),
1238 );
1239
1240 #[cfg(feature = "adblock_easylist")]
1243 {
1244 static EASYLIST: &str = include_str!(concat!(env!("OUT_DIR"), "/easylist.txt"));
1245 static EASYPRIVACY: &str = include_str!(concat!(env!("OUT_DIR"), "/easyprivacy.txt"));
1246
1247 if !EASYLIST.is_empty() {
1248 filter_set.add_filter_list(EASYLIST, rules.clone());
1249 }
1250 if !EASYPRIVACY.is_empty() {
1251 filter_set.add_filter_list(EASYPRIVACY, rules);
1252 }
1253 }
1254
1255 Engine::from_filter_set(filter_set, true)
1256 };
1257 }
1258
1259 let blockable = event.resource_type == ResourceType::Script
1260 || event.resource_type == ResourceType::Image
1261 || event.resource_type == ResourceType::Media
1262 || event.resource_type == ResourceType::Stylesheet
1263 || event.resource_type == ResourceType::Document
1264 || event.resource_type == ResourceType::Fetch
1265 || event.resource_type == ResourceType::Xhr;
1266
1267 if !blockable {
1268 return false;
1269 }
1270
1271 let u = &event.request.url;
1272
1273 let source_domain = if self.document_target_domain.is_empty() {
1274 "example.com"
1275 } else {
1276 &self.document_target_domain
1277 };
1278
1279 let hostname = u
1282 .strip_prefix("https://")
1283 .or_else(|| u.strip_prefix("http://"))
1284 .and_then(|rest| rest.split('/').next())
1285 .map(|authority| match authority.rfind('@') {
1287 Some(i) => &authority[i + 1..],
1288 None => authority,
1289 })
1290 .and_then(|host_port| host_port.split(':').next())
1292 .unwrap_or(source_domain);
1293
1294 let resource_type_str = match event.resource_type {
1295 ResourceType::Script => "script",
1296 ResourceType::Image => "image",
1297 ResourceType::Media => "media",
1298 ResourceType::Stylesheet => "stylesheet",
1299 ResourceType::Document => "document",
1300 ResourceType::Fetch => "fetch",
1301 ResourceType::Xhr => "xhr",
1302 _ => "other",
1303 };
1304
1305 let request = adblock::request::Request::preparsed(
1306 u,
1307 hostname,
1308 source_domain,
1309 resource_type_str,
1310 !event.request.is_same_site.unwrap_or_default(),
1311 );
1312
1313 let engine: &Engine = match self.adblock_engine.as_ref() {
1314 Some(custom) => custom,
1315 None => &AD_ENGINE,
1316 };
1317
1318 engine.check_network_request(&request).matched
1319 }
1320
1321 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1322 let response = if self
1323 .attempted_authentications
1324 .contains(event.request_id.as_ref())
1325 {
1326 AuthChallengeResponseResponse::CancelAuth
1327 } else if self.credentials.is_some() {
1328 self.attempted_authentications
1329 .insert(event.request_id.clone().into());
1330 AuthChallengeResponseResponse::ProvideCredentials
1331 } else {
1332 AuthChallengeResponseResponse::Default
1333 };
1334
1335 let mut auth = AuthChallengeResponse::new(response);
1336 if let Some(creds) = self.credentials.clone() {
1337 auth.username = Some(creds.username);
1338 auth.password = Some(creds.password);
1339 }
1340 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1341 }
1342
1343 pub fn set_offline_mode(&mut self, value: bool) {
1345 if self.offline == value {
1346 return;
1347 }
1348 self.offline = value;
1349 if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1350 .offline(self.offline)
1351 .matched_network_condition(
1352 NetworkConditions::builder()
1353 .url_pattern("")
1354 .latency(0)
1355 .download_throughput(-1.)
1356 .upload_throughput(-1.)
1357 .build()
1358 .unwrap(),
1359 )
1360 .build()
1361 {
1362 self.push_cdp_request(network);
1363 }
1364 }
1365
1366 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1368 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1369 if let Some((interception_id, _)) = self
1370 .request_id_to_interception_id
1371 .remove(event.request_id.as_ref())
1372 {
1373 self.on_request(event, Some(interception_id));
1374 } else {
1375 self.requests_will_be_sent
1376 .insert(event.request_id.clone(), (event.clone(), Instant::now()));
1377 }
1378 } else {
1379 self.on_request(event, None);
1380 }
1381 }
1382
1383 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1385 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1386 request.from_memory_cache = true;
1387 }
1388 }
1389
1390 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1392 let mut request_failed = false;
1393
1394 let mut deducted: u64 = 0;
1396
1397 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1398 let before = *max_bytes;
1399
1400 let received_bytes: u64 = event.response.encoded_data_length as u64;
1402
1403 let content_length: Option<u64> = event
1405 .response
1406 .headers
1407 .inner()
1408 .get("content-length")
1409 .and_then(|v| v.as_str())
1410 .and_then(|s| s.trim().parse::<u64>().ok());
1411
1412 *max_bytes = max_bytes.saturating_sub(received_bytes);
1414
1415 if let Some(cl) = content_length {
1417 if cl > *max_bytes {
1418 *max_bytes = 0;
1419 }
1420 }
1421
1422 request_failed = *max_bytes == 0;
1423
1424 deducted = before.saturating_sub(*max_bytes);
1426 }
1427
1428 if deducted > 0 {
1430 self.queued_events
1431 .push_back(NetworkEvent::BytesConsumed(deducted));
1432 }
1433
1434 if request_failed && self.max_bytes_allowed.is_some() {
1436 self.set_block_all(true);
1437 }
1438
1439 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1440 request.set_response(event.response.clone());
1441 self.queued_events.push_back(if request_failed {
1442 NetworkEvent::RequestFailed(request)
1443 } else {
1444 NetworkEvent::RequestFinished(request)
1445 });
1446 }
1447 }
1448
1449 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1451 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1452 if let Some(interception_id) = request.interception_id.as_ref() {
1453 self.attempted_authentications
1454 .remove(interception_id.as_ref());
1455 }
1456 self.queued_events
1457 .push_back(NetworkEvent::RequestFinished(request));
1458 }
1459 }
1460
1461 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1463 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1464 request.failure_text = Some(event.error_text.clone());
1465 if let Some(interception_id) = request.interception_id.as_ref() {
1466 self.attempted_authentications
1467 .remove(interception_id.as_ref());
1468 }
1469 self.queued_events
1470 .push_back(NetworkEvent::RequestFailed(request));
1471 }
1472 }
1473
1474 fn on_request(
1476 &mut self,
1477 event: &EventRequestWillBeSent,
1478 interception_id: Option<InterceptionId>,
1479 ) {
1480 let mut redirect_chain = Vec::new();
1481 let mut redirect_location = None;
1482
1483 if let Some(redirect_resp) = &event.redirect_response {
1484 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1485 if is_redirect_status(redirect_resp.status) {
1486 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1487 if redirect_resp.url != location {
1488 let fixed_location = location.replace(&redirect_resp.url, "");
1489
1490 if !fixed_location.is_empty() {
1491 if let Some(resp) = request.response.as_mut() {
1492 resp.headers.0["Location"] =
1493 serde_json::Value::String(fixed_location.clone());
1494 }
1495 }
1496
1497 redirect_location = Some(fixed_location);
1498 }
1499 }
1500 }
1501
1502 {
1503 let mut redirect_resp = redirect_resp.clone();
1504
1505 if let Some(redirect_location) = redirect_location {
1506 if !redirect_location.is_empty() {
1507 redirect_resp.headers.0["Location"] =
1508 serde_json::Value::String(redirect_location);
1509 }
1510 }
1511
1512 self.handle_request_redirect(&mut request, redirect_resp);
1513 }
1514
1515 redirect_chain = std::mem::take(&mut request.redirect_chain);
1516 redirect_chain.push(request);
1517 }
1518 }
1519
1520 let request = HttpRequest::new(
1521 event.request_id.clone(),
1522 event.frame_id.clone(),
1523 interception_id,
1524 self.user_request_interception_enabled,
1525 redirect_chain,
1526 );
1527
1528 self.requests.insert(event.request_id.clone(), request);
1529 self.queued_events
1530 .push_back(NetworkEvent::Request(event.request_id.clone()));
1531 }
1532
1533 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1535 request.set_response(response);
1536 if let Some(interception_id) = request.interception_id.as_ref() {
1537 self.attempted_authentications
1538 .remove(interception_id.as_ref());
1539 }
1540 }
1541}
1542
/// Events the network manager queues in `queued_events` for its owner to
/// consume.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP command to send: the method identifier (e.g.
    /// `Fetch.fulfillRequest`) paired with a JSON value, presumably the
    /// serialized command parameters.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request with this id was registered by the manager.
    Request(RequestId),
    /// NOTE(review): not constructed in this part of the file; presumably
    /// signals that a response arrived for the request id. Confirm against the
    /// emitter.
    Response(RequestId),
    /// The request failed to load, or it completed after the configured byte
    /// budget had been exhausted.
    RequestFailed(HttpRequest),
    /// The request completed (response received or loading finished).
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the configured byte budget by a response.
    BytesConsumed(u64),
}
1558
// Unit tests for the allow/block matchers, the dynamic black/white lists and,
// behind the `adblock` feature, the ad detection and the paused-request
// interception pipeline.
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    // The third-party allow-list must match challenge/payment/captcha/CDN
    // scripts and must not match an analytics beacon.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // Stripe, reCAPTCHA and the jQuery CDN are expected on the allow-list.
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    // A script URL that matches no blocklist entry must not be blocked.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    // A script URL ending in `analytics.js` is expected to hit the blocklist.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    // NOTE(review): the assertions below are the same as in
    // `test_allowed_matcher_3rd_party`.
    #[test]
    fn test_allowed_matcher_3rd_party_sanity() {
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }
    // Patterns installed at runtime must match URLs containing them and leave
    // unrelated URLs alone.
    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    // With the same pattern on both lists and strict mode on, both matchers
    // report a hit and the strict flag is set. Only the matcher results and the
    // flag are asserted here, not the final block decision.
    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        assert!(nm.blacklist_strict);
    }

    // Builds a synthetic `Fetch.requestPaused` event for `url`: a GET request
    // with empty headers, request id "test-req" and frame id "frame1". Only the
    // URL, resource type and same-site flag vary between tests.
    #[cfg(feature = "adblock")]
    fn make_request_paused(
        url: &str,
        resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType,
        is_same_site: bool,
    ) -> chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused {
        use chromiumoxide_cdp::cdp::browser_protocol::network::{
            Headers, Request, ResourcePriority, RequestReferrerPolicy,
        };
        use chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused;

        EventRequestPaused {
            request_id: chromiumoxide_cdp::cdp::browser_protocol::network::RequestId::from(
                "test-req".to_string(),
            )
            .into(),
            request: Request {
                url: url.to_string(),
                method: "GET".to_string(),
                headers: Headers::new(serde_json::Value::Object(Default::default())),
                initial_priority: ResourcePriority::Medium,
                referrer_policy: RequestReferrerPolicy::NoReferrer,
                url_fragment: None,
                has_post_data: None,
                post_data_entries: None,
                mixed_content_type: None,
                is_link_preload: None,
                trust_token_params: None,
                is_same_site: Some(is_same_site),
                is_ad_related: None,
            },
            frame_id: chromiumoxide_cdp::cdp::browser_protocol::page::FrameId::from(
                "frame1".to_string(),
            ),
            resource_type,
            response_error_reason: None,
            response_status_code: None,
            response_status_text: None,
            response_headers: None,
            network_id: None,
            redirected_request_id: None,
        }
    }

    // `detect_ad` with the default engine must flag a third-party Google Tag
    // Manager script.
    #[cfg(feature = "adblock")]
    #[test]
    fn test_detect_ad_blocks_known_tracker_scripts() {
        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.wine-searcher.com/".to_string());

        let event = make_request_paused(
            "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
            ResourceType::Script,
            false,
        );

        assert!(
            nm.detect_ad(&event),
            "googletagmanager.com script should be detected as ad"
        );
    }

    // A same-site application bundle must not be flagged by `detect_ad`.
    #[cfg(feature = "adblock")]
    #[test]
    fn test_detect_ad_allows_legitimate_scripts() {
        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());

        let event = make_request_paused(
            "https://www.mylegitsite-test.com/static/js/app-bundle.js",
            ResourceType::Script,
            true,
        );

        assert!(
            !nm.detect_ad(&event),
            "legitimate first-party app bundle should not be blocked"
        );
    }

    // `set_page_url` must populate `document_target_domain` (which `detect_ad`
    // passes to the engine as the source domain), and a third-party analytics
    // script must then be flagged.
    #[cfg(feature = "adblock")]
    #[test]
    fn test_detect_ad_uses_source_domain() {
        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.wine-searcher.com/some-page".to_string());

        assert!(
            !nm.document_target_domain.is_empty(),
            "document_target_domain should be set after set_page_url"
        );

        let event = make_request_paused(
            "https://www.google-analytics.com/analytics.js",
            ResourceType::Script,
            false,
        );

        assert!(
            nm.detect_ad(&event),
            "google-analytics.com should be blocked as tracker"
        );
    }

    // A rule that exists only in a caller-supplied engine must be honoured by
    // `detect_ad`.
    #[cfg(feature = "adblock")]
    #[test]
    fn test_custom_adblock_engine_takes_precedence() {
        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        let mut filter_set = adblock::lists::FilterSet::new(false);
        let mut opts = adblock::lists::ParseOptions::default();
        opts.rule_types = adblock::lists::RuleTypes::All;
        filter_set.add_filters(["||custom-tracker.example.net^"], opts);
        let engine = adblock::Engine::from_filter_set(filter_set, true);
        nm.set_adblock_engine(std::sync::Arc::new(engine));

        let event = make_request_paused(
            "https://custom-tracker.example.net/pixel.js",
            ResourceType::Script,
            false,
        );

        assert!(
            nm.detect_ad(&event),
            "custom engine rule should block custom-tracker.example.net"
        );
    }

    // Runs one synthetic paused request through `on_fetch_request_paused` and
    // returns whether the manager queued a `Fetch.fulfillRequest` or
    // `Fetch.failRequest` command in response, i.e. whether it was blocked.
    #[cfg(feature = "adblock")]
    fn run_full_interception(nm: &mut NetworkManager, url: &str, resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType, is_same_site: bool) -> bool {
        use super::NetworkEvent;

        // Discard events left over from setup or from a previous call.
        while nm.poll().is_some() {}

        let event = make_request_paused(url, resource_type, is_same_site);
        nm.on_fetch_request_paused(&event);

        let mut blocked = false;
        // Drain everything the handler queued and look for a blocking command.
        while let Some(ev) = nm.poll() {
            if let NetworkEvent::SendCdpRequest((method, _)) = &ev {
                let m: &str = method.as_ref();
                if m == "Fetch.fulfillRequest" || m == "Fetch.failRequest" {
                    blocked = true;
                }
            }
        }
        blocked
    }

    // Pipeline: a third-party tracker script is blocked.
    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_tracker_script_blocked() {
        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.wine-searcher.com/".to_string());

        assert!(
            run_full_interception(
                &mut nm,
                "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
                ResourceType::Script,
                false,
            ),
            "GTM script should be blocked through full pipeline"
        );
    }

    // Pipeline: a same-site script passes through.
    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_legitimate_script_allowed() {
        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());

        assert!(
            !run_full_interception(
                &mut nm,
                "https://www.mylegitsite-test.com/static/js/app-bundle.js",
                ResourceType::Script,
                true,
            ),
            "legitimate first-party script should be allowed through full pipeline"
        );
    }

    // Pipeline: a third-party analytics XHR is blocked.
    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_analytics_xhr_blocked() {
        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.org/".to_string());

        assert!(
            run_full_interception(
                &mut nm,
                "https://www.google-analytics.com/g/collect?v=2&tid=UA-123",
                ResourceType::Xhr,
                false,
            ),
            "Google Analytics XHR should be blocked through full pipeline"
        );
    }

    // Pipeline: a whitelist pattern lets a URL through that ad blocking would
    // otherwise stop.
    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_whitelisted_overrides_adblock() {
        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.org/".to_string());
        nm.set_whitelist_patterns(["googletagmanager.com"]);

        assert!(
            !run_full_interception(
                &mut nm,
                "https://www.googletagmanager.com/gtm.js?id=GTM-TEST",
                ResourceType::Script,
                false,
            ),
            "whitelisted tracker should be allowed even when adblock would block it"
        );
    }

    // Pipeline: with strict mode on, a URL present on both lists is blocked.
    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_blacklist_strict_overrides_whitelist() {
        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.org/".to_string());
        nm.set_blacklist_patterns(["cdn.example.net/evil.js"]);
        nm.set_whitelist_patterns(["cdn.example.net/evil.js"]);
        nm.set_blacklist_strict(true);

        assert!(
            run_full_interception(
                &mut nm,
                "https://cdn.example.net/evil.js",
                ResourceType::Script,
                false,
            ),
            "strict blacklist should win over whitelist"
        );
    }

    // Pipeline: a same-site document navigation is not blocked.
    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_first_party_document_not_blocked() {
        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.nytimes.com/".to_string());

        assert!(
            !run_full_interception(
                &mut nm,
                "https://www.nytimes.com/2024/article.html",
                ResourceType::Document,
                true,
            ),
            "first-party document navigation should never be blocked"
        );
    }

    // Pipeline: a caller-supplied engine blocks what its rule names and leaves
    // a same-site script alone.
    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_custom_engine_blocks_through_pipeline() {
        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://mysite.com/".to_string());

        let mut filter_set = adblock::lists::FilterSet::new(false);
        let mut opts = adblock::lists::ParseOptions::default();
        opts.rule_types = adblock::lists::RuleTypes::All;
        filter_set.add_filters(["||evil-cdn.example.net^$script"], opts);
        let engine = adblock::Engine::from_filter_set(filter_set, true);
        nm.set_adblock_engine(std::sync::Arc::new(engine));

        assert!(
            run_full_interception(
                &mut nm,
                "https://evil-cdn.example.net/tracker.js",
                ResourceType::Script,
                false,
            ),
            "custom engine rule should block through full pipeline"
        );

        assert!(
            !run_full_interception(
                &mut nm,
                "https://mysite.com/app.js",
                ResourceType::Script,
                true,
            ),
            "first-party script should still be allowed with custom engine"
        );
    }

    // Pipeline: a third-party ad/tracking pixel is blocked while a same-site
    // image is not.
    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_ad_image_blocked() {
        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());

        assert!(
            run_full_interception(
                &mut nm,
                "https://googleads.g.doubleclick.net/pagead/viewthroughconversion/123/?random=456",
                ResourceType::Image,
                false,
            ),
            "doubleclick ad image/tracking pixel should be blocked"
        );

        assert!(
            !run_full_interception(
                &mut nm,
                "https://www.mylegitsite-test.com/images/logo.png",
                ResourceType::Image,
                true,
            ),
            "legitimate first-party image should not be blocked"
        );
    }

    // Pipeline: `user:pass@` userinfo in the URL must not prevent a tracker
    // from being blocked.
    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_hostname_with_userinfo() {
        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;

        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.org/".to_string());

        assert!(
            run_full_interception(
                &mut nm,
                "https://user:pass@www.googletagmanager.com/gtm.js?id=GTM-XXXX",
                ResourceType::Script,
                false,
            ),
            "tracker URL with userinfo should still be blocked"
        );
    }

    // With strict mode off and the same pattern on both lists, both matchers
    // still report a hit and the strict flag stays cleared. As in the strict
    // variant above, only matcher results and the flag are asserted.
    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}