1use super::blockers::{
2 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3 xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{
11 base_domain_from_any, base_domain_from_host, first_label, host_and_rest,
12 host_contains_label_icase, host_is_subdomain_of,
13};
14use aho_corasick::AhoCorasick;
15use case_insensitive_string::CaseInsensitiveString;
16use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
17use chromiumoxide_cdp::cdp::browser_protocol::network::{
18 EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
19 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
20 InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
21 SetExtraHttpHeadersParams,
22};
23use chromiumoxide_cdp::cdp::browser_protocol::{
24 fetch::{
25 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
26 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
27 },
28 network::SetBypassServiceWorkerParams,
29};
30use chromiumoxide_cdp::cdp::browser_protocol::{
31 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
32};
33use chromiumoxide_types::{Command, Method, MethodId};
34use hashbrown::{HashMap, HashSet};
35use lazy_static::lazy_static;
36use reqwest::header::PROXY_AUTHORIZATION;
37use spider_network_blocker::intercept_manager::NetworkInterceptManager;
38pub use spider_network_blocker::scripts::{
39 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
40};
41use std::borrow::Cow;
42use std::collections::VecDeque;
43use std::time::Duration;
44
45lazy_static! {
    /// Substrings that mark a script URL as allowed when JavaScript blocking is enabled.
    ///
    /// The list mixes generic framework/bundle names (matched anywhere in the URL)
    /// with path fragments and full origins of challenge, captcha, auth and payment scripts.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery", "angular",
        "react", "vue", "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app", "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/", "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/enterprise.js",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://captcha.px-cloud.net/",
        "https://cdn.auth0.com/js/lock/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
    ];

    /// Substring matcher built from `JS_FRAMEWORK_ALLOW`.
    ///
    /// Building happens lazily on first use and panics if the automaton cannot be built.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
87
88 static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
90 "https://m.stripe.network/",
92 "https://challenges.cloudflare.com/",
93 "https://www.google.com/recaptcha/api.js",
94 "https://google.com/recaptcha/api.js",
95 "https://www.google.com/recaptcha/enterprise.js",
96 "https://js.stripe.com/",
97 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
100 "https://ct.captcha-delivery.com/",
101 "https://geo.captcha-delivery.com/captcha/",
102 "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://ct.captcha-delivery.com/",
104 "https://cdn.auth0.com/client",
105 "https://captcha.px-cloud.net/",
106 "https://static.intercomassets.com/", "/cdn-cgi/challenge-platform/"
108 ];
109
110 pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
112
    /// Path prefixes (relative to the host) of framework output directories.
    ///
    /// `ignore_script` treats `.js` files under these prefixes as ignorable when the
    /// intercept manager is `Unknown`.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            "_astro/", "_app/immutable"
        }
    };
120
    /// MIME types of archive, image, video, audio and office-document content.
    ///
    /// NOTE(review): not referenced in this file; presumably consulted by callers
    /// that want to skip non-HTML responses — confirm against usages.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };
149
    /// Resource type names that are blocked when `NetworkManager::ignore_visuals` is set.
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names that `on_fetch_request_paused` always marks for blocking,
    /// regardless of the other blocking flags.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Manifest",
        "Other",
        "Prefetch",
        "Ping",
    };
165
    /// Case-insensitive `css` extension, compared against the URL suffix in `skip_xhr`.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
168
169 pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
171 let enable = EnableParams::default();
172
173 if let Ok(c) = serde_json::to_value(&enable) {
174 vec![(enable.identifier(), c)]
175 } else {
176 vec![]
177 }
178 };
179
180 pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
182 let enable = EnableParams::default();
183 let mut v = vec![];
184 if let Ok(c) = serde_json::to_value(&enable) {
185 v.push((enable.identifier(), c));
186 }
187 let ignore = SetIgnoreCertificateErrorsParams::new(true);
188 if let Ok(ignored) = serde_json::to_value(&ignore) {
189 v.push((ignore.identifier(), ignored));
190 }
191
192 v
193 };
194
195 pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
197 fetch::EnableParams::builder()
198 .handle_auth_requests(true)
199 .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
200 .build()
201 };
202}
203
/// Returns `true` when `status` is one of the redirect codes 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];

    REDIRECT_CODES.contains(&status)
}
208
/// Tracks network requests of a target and decides which paused requests are
/// blocked, served from cache or continued.
#[derive(Debug)]
pub struct NetworkManager {
    /// Events waiting to be handed out through `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// Selects the init chain that also tells the browser to ignore certificate errors.
    ignore_httpserrors: bool,
    /// Requests currently tracked, keyed by their network request id.
    requests: HashMap<RequestId, HttpRequest>,
    /// `requestWillBeSent` events parked until the matching paused request arrives.
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra headers last passed to `set_extra_headers` (proxy authorization removed).
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception ids parked until the matching `requestWillBeSent` event arrives.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the caller asked for the browser cache to be disabled.
    user_cache_disabled: bool,
    /// Requests for which credentials have already been supplied once.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials used to answer auth challenges, if any.
    credentials: Option<Credentials>,
    /// Whether the caller asked for request interception.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed as blocked.
    block_all: bool,
    /// Whether request interception is considered active at the protocol level.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Last offline state pushed through `set_offline_mode`.
    offline: bool,
    /// Timeout handed to the init command chain.
    pub request_timeout: Duration,
    /// Block visual resources (see `IGNORE_VISUAL_RESOURCE_MAP`).
    pub ignore_visuals: bool,
    /// Block stylesheet resources.
    pub block_stylesheets: bool,
    /// Block scripts that are not on the allow-list.
    pub block_javascript: bool,
    /// Block analytics/tracking scripts and XHR requests.
    pub block_analytics: bool,
    /// Enables the embedded-script check for scripts and documents.
    /// NOTE(review): presumably means only the HTML is of interest — confirm.
    pub only_html: bool,
    /// Set when the first document URL ends with `.xml`; lets `.xsl` requests through.
    pub xml_document: bool,
    /// Site-specific interception rules.
    pub intercept_manager: NetworkInterceptManager,
    /// Counts repeated requests of the target document; 3 or more blocks further requests.
    pub document_reload_tracker: u8,
    /// URL of the most recently requested document.
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`.
    pub document_target_domain: String,
    /// Remaining byte budget; when it reaches zero all requests are blocked.
    pub max_bytes_allowed: Option<u64>,
    /// Key used to look up session cache entries.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// Policy deciding whether a cached entry may be served.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// URL substrings that are never blocked.
    whitelist_patterns: Vec<String>,
    /// Matcher compiled from `whitelist_patterns`; `None` when the list is empty.
    whitelist_matcher: Option<AhoCorasick>,
}
263
264impl NetworkManager {
265 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
267 Self {
268 queued_events: Default::default(),
269 ignore_httpserrors,
270 requests: Default::default(),
271 requests_will_be_sent: Default::default(),
272 extra_headers: Default::default(),
273 request_id_to_interception_id: Default::default(),
274 user_cache_disabled: false,
275 attempted_authentications: Default::default(),
276 credentials: None,
277 block_all: false,
278 user_request_interception_enabled: false,
279 protocol_request_interception_enabled: false,
280 offline: false,
281 request_timeout,
282 ignore_visuals: false,
283 block_javascript: false,
284 block_stylesheets: false,
285 block_analytics: true,
286 only_html: false,
287 xml_document: false,
288 intercept_manager: NetworkInterceptManager::Unknown,
289 document_reload_tracker: 0,
290 document_target_url: String::new(),
291 document_target_domain: String::new(),
292 whitelist_patterns: Vec::new(),
293 whitelist_matcher: None,
294 max_bytes_allowed: None,
295 #[cfg(feature = "_cache")]
296 cache_site_key: None,
297 #[cfg(feature = "_cache")]
298 cache_policy: None,
299 }
300 }
301
302 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
304 where
305 I: IntoIterator<Item = S>,
306 S: Into<String>,
307 {
308 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
309 self.rebuild_whitelist_matcher();
310 }
311
312 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
314 self.whitelist_patterns.push(pattern.into());
315 self.rebuild_whitelist_matcher();
316 }
317
318 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
320 where
321 I: IntoIterator<Item = S>,
322 S: Into<String>,
323 {
324 self.whitelist_patterns
325 .extend(patterns.into_iter().map(Into::into));
326 self.rebuild_whitelist_matcher();
327 }
328
329 #[inline]
330 fn rebuild_whitelist_matcher(&mut self) {
331 if self.whitelist_patterns.is_empty() {
332 self.whitelist_matcher = None;
333 return;
334 }
335
336 let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
337
338 self.whitelist_matcher = AhoCorasick::new(refs).ok();
340 }
341
342 #[inline]
343 fn is_whitelisted(&self, url: &str) -> bool {
344 self.whitelist_matcher
345 .as_ref()
346 .map(|m| m.is_match(url))
347 .unwrap_or(false)
348 }
349
350 pub fn init_commands(&self) -> CommandChain {
352 let cmds = if self.ignore_httpserrors {
353 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
354 } else {
355 INIT_CHAIN.clone()
356 };
357 CommandChain::new(cmds, self.request_timeout)
358 }
359
360 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
362 let method = cmd.identifier();
363 if let Ok(params) = serde_json::to_value(cmd) {
364 self.queued_events
365 .push_back(NetworkEvent::SendCdpRequest((method, params)));
366 }
367 }
368
    /// Removes and returns the oldest queued event, or `None` when the queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
373
    /// Returns the extra headers currently stored on the manager.
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
378
379 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
381 self.extra_headers = headers;
382 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
383 self.extra_headers.remove("Proxy-Authorization");
384 if !self.extra_headers.is_empty() {
385 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
386 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
387 }
388 }
389 }
390
    /// Queues a command that sets the service-worker bypass flag.
    ///
    /// Note that `bypass` is forwarded unchanged: `true` means service workers are
    /// bypassed, despite the method name.
    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
    }
394
    /// Sets whether every paused request is failed as blocked.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
398
    /// Records the caller's interception preference and queues the matching
    /// enable/disable command if the protocol state differs.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        self.update_protocol_request_interception();
    }
403
404 pub fn set_cache_enabled(&mut self, enabled: bool) {
405 let run = self.user_cache_disabled != !enabled;
406 self.user_cache_disabled = !enabled;
407 if run {
408 self.update_protocol_cache_disabled();
409 }
410 }
411
    /// Marks protocol-level request interception as enabled (no command is queued).
    pub fn enable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
416
    /// Marks protocol-level request interception as disabled (no command is queued).
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = false;
    }
421
    /// Sets the key used to look up entries in the session cache.
    #[cfg(feature = "_cache")]
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
427
    /// Sets the policy that decides whether a cached response may be served.
    #[cfg(feature = "_cache")]
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
433
    /// Queues a command that pushes the current cache-disabled flag to the browser.
    pub fn update_protocol_cache_disabled(&mut self) {
        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
    }
437
    /// Stores `credentials` for answering auth challenges and turns interception on.
    ///
    /// The protocol flag is set only after `update_protocol_request_interception`
    /// runs, so that call can still queue the enable command.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        self.update_protocol_request_interception();
        self.protocol_request_interception_enabled = true;
    }
443
444 fn update_protocol_request_interception(&mut self) {
445 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
446
447 if enabled == self.protocol_request_interception_enabled {
448 return;
449 }
450
451 if enabled {
452 self.push_cdp_request(ENABLE_FETCH.clone())
453 } else {
454 self.push_cdp_request(DisableParams::default())
455 }
456 }
457
458 #[inline]
459 fn rel_for_ignore_script<'a>(&self, url: &'a str) -> Cow<'a, str> {
460 if url.starts_with('/') {
461 return Cow::Borrowed(url);
462 }
463
464 let base_raw = self.document_target_domain.as_str();
465
466 if base_raw.is_empty() {
467 return Cow::Borrowed(url);
468 }
469
470 let base = base_domain_from_any(base_raw).trim_end_matches('.');
471 if base.is_empty() {
472 return Cow::Borrowed(url);
473 }
474
475 let brand = first_label(base);
476
477 if let Some((host, rest)) = host_and_rest(url) {
478 if host_is_subdomain_of(host, base) || host_contains_label_icase(host, brand) {
479 return if rest.starts_with('/') {
480 Cow::Borrowed(rest)
481 } else {
482 Cow::Borrowed("/")
483 };
484 }
485 }
486
487 Cow::Borrowed(url)
488 }
489
490 #[inline]
492 pub(crate) fn ignore_script(
493 &self,
494 url: &str,
495 block_analytics: bool,
496 intercept_manager: NetworkInterceptManager,
497 ) -> bool {
498 let mut ignore_script = !url.starts_with("/");
500
501 if !ignore_script
502 && block_analytics
503 && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
504 {
505 ignore_script = true;
506 }
507
508 if !ignore_script {
509 if let Some(index) = url.find("//") {
510 let pos = index + 2;
511
512 if pos < url.len() {
514 if let Some(slash_index) = url[pos..].find('/') {
516 let base_path_index = pos + slash_index + 1;
517
518 if url.len() > base_path_index {
519 let new_url: &str = &url[base_path_index..];
520
521 if !ignore_script
523 && intercept_manager == NetworkInterceptManager::Unknown
524 {
525 let hydration_file =
526 JS_FRAMEWORK_PATH.iter().any(|p| new_url.starts_with(p));
527
528 if hydration_file && new_url.ends_with(".js") {
530 ignore_script = true;
531 }
532 }
533
534 if !ignore_script
535 && URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(new_url)
536 {
537 ignore_script = true;
538 }
539
540 if !ignore_script
541 && self.ignore_visuals
542 && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(new_url)
543 {
544 ignore_script = true;
545 }
546 }
547 }
548 }
549 }
550 }
551
552 if !ignore_script && block_analytics {
554 ignore_script = URL_IGNORE_TRIE_PATHS.contains_prefix(url);
555 }
556
557 ignore_script
558 }
559
    /// Decides whether a data (XHR-like) request should be blocked.
    ///
    /// Returns `skip_networking` unchanged when the request was already marked for
    /// skipping or is not a network event. Otherwise the request is blocked when
    /// analytics blocking matches the URL, or — with `block_stylesheets` /
    /// `ignore_visuals` — when the URL's extension or a media rule matches.
    #[inline]
    fn skip_xhr(
        &self,
        skip_networking: bool,
        event: &EventRequestPaused,
        network_event: bool,
    ) -> bool {
        if !skip_networking && network_event {
            let request_url = event.request.url.as_str();

            // Analytics rules take precedence over the asset checks.
            let skip_analytics =
                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));

            if skip_analytics {
                true
            } else if self.block_stylesheets || self.ignore_visuals {
                let block_css = self.block_stylesheets;
                let block_media = self.ignore_visuals;

                let mut block_request = false;

                // Inspect the text after the last `.` as the file extension.
                if let Some(position) = request_url.rfind('.') {
                    let hlen = request_url.len();
                    let has_asset = hlen - position;

                    // Requires at least two characters after the dot.
                    if has_asset >= 3 {
                        let next_position = position + 1;

                        if block_media
                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
                                &request_url[next_position..].into(),
                            )
                        {
                            block_request = true;
                        } else if block_css {
                            block_request =
                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
                                    .contains(&**CSS_EXTENSION)
                        }
                    }
                }

                // Fall back to the media-specific XHR rules.
                if !block_request {
                    block_request = ignore_script_xhr_media(request_url);
                }

                block_request
            } else {
                skip_networking
            }
        } else {
            skip_networking
        }
    }
617
618 #[cfg(feature = "adblock")]
619 #[inline]
620 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
622 if skip_networking {
623 true
624 } else {
625 self.detect_ad(event)
626 }
627 }
628
    /// Without the `adblock` feature there is no ad detection: the existing
    /// decision is returned unchanged.
    #[cfg(not(feature = "adblock"))]
    #[inline]
    fn detect_ad_if_enabled(&mut self, _event: &EventRequestPaused, skip_networking: bool) -> bool {
        skip_networking
    }
635
636 #[inline]
637 fn fail_request_blocked(
639 &mut self,
640 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
641 ) {
642 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
643 request_id.clone(),
644 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
645 );
646 self.push_cdp_request(params);
647 }
648
649 #[inline]
650 fn fulfill_request_empty_200(
652 &mut self,
653 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
654 ) {
655 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
656 request_id.clone(),
657 200,
658 );
659 self.push_cdp_request(params);
660 }
661
662 #[cfg(feature = "_cache")]
663 #[inline]
664 fn fulfill_request_from_cache(
668 &mut self,
669 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
670 body: &[u8],
671 headers: &std::collections::HashMap<String, String>,
672 status: i64,
673 ) {
674 use crate::cdp::browser_protocol::fetch::HeaderEntry;
675 use crate::handler::network::fetch::FulfillRequestParams;
676 use base64::Engine;
677
678 let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
679
680 for (k, v) in headers.iter() {
681 resp_headers.push(HeaderEntry {
682 name: k.clone().into(),
683 value: v.clone().into(),
684 });
685 }
686
687 let mut params = FulfillRequestParams::new(request_id.clone(), status);
688
689 params.body = Some(
691 base64::engine::general_purpose::STANDARD
692 .encode(body)
693 .into(),
694 );
695
696 params.response_headers = Some(resp_headers);
697
698 self.push_cdp_request(params);
699 }
700
701 #[inline]
702 fn continue_request_with_url(
704 &mut self,
705 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
706 url: Option<&str>,
707 intercept_response: bool,
708 ) {
709 let mut params = ContinueRequestParams::new(request_id.clone());
710 if let Some(url) = url {
711 params.url = Some(url.to_string());
712 params.intercept_response = Some(intercept_response);
713 }
714 self.push_cdp_request(params);
715 }
716
717 #[inline]
719 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
720 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
721 return;
722 }
723
724 let resource_type = &event.resource_type;
725
726 if self.block_all {
727 tracing::debug!(
728 "Blocked (block_all): {:?} - {}",
729 event.resource_type,
730 event.request.url
731 );
732 return self.fail_request_blocked(&event.request_id);
733 }
734
735 if let Some(network_id) = event.network_id.as_ref() {
741 if let Some(request_will_be_sent) =
742 self.requests_will_be_sent.remove(network_id.as_ref())
743 {
744 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
745 } else {
746 self.request_id_to_interception_id
747 .insert(network_id.clone(), event.request_id.clone().into());
748 }
749 }
750
751 let javascript_resource = *resource_type == ResourceType::Script;
753 let document_resource = *resource_type == ResourceType::Document;
754 let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
755
756 let mut skip_networking =
758 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
759
760 if !skip_networking {
762 skip_networking = self.document_reload_tracker >= 3;
763 }
764
765 let (current_url_cow, had_replacer) =
767 self.handle_document_replacement_and_tracking(event, document_resource);
768
769 let current_url: &str = current_url_cow.as_ref();
770
771 if !skip_networking {
773 if self.xml_document && current_url.ends_with(".xsl") {
775 skip_networking = false;
776 } else {
777 skip_networking = self.should_skip_for_visuals_and_basic_js(
778 resource_type,
779 javascript_resource,
780 current_url,
781 );
782 }
783 }
784
785 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
787
788 if !skip_networking
790 && (self.only_html || self.ignore_visuals)
791 && (javascript_resource || document_resource)
792 {
793 skip_networking = ignore_script_embedded(current_url);
794 }
795
796 if skip_networking && javascript_resource {
798 let rel = self.rel_for_ignore_script(current_url);
799 skip_networking =
800 self.ignore_script(rel.as_ref(), self.block_analytics, self.intercept_manager);
801 }
802
803 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
805
806 if !skip_networking && (javascript_resource || network_resource || document_resource) {
808 skip_networking = self.intercept_manager.intercept_detection(
809 current_url,
810 self.ignore_visuals,
811 network_resource,
812 );
813 }
814
815 if !skip_networking && (javascript_resource || network_resource) {
817 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
818 }
819
820 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
822 {
823 skip_networking = false;
824 }
825
826 if skip_networking && self.is_whitelisted(current_url) {
828 skip_networking = false;
829 }
830
831 if skip_networking {
832 tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
833 self.fulfill_request_empty_200(&event.request_id);
834 } else {
835 #[cfg(feature = "_cache")]
836 {
837 if let (Some(policy), Some(cache_site_key)) =
838 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
839 {
840 let current_url = format!("{}:{}", event.request.method, ¤t_url);
841
842 if let Some((res, cache_policy)) =
843 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
844 {
845 if policy.allows_cached(&cache_policy) {
846 tracing::debug!(
847 "Remote Cached: {:?} - {}",
848 resource_type,
849 ¤t_url
850 );
851 return self.fulfill_request_from_cache(
852 &event.request_id,
853 &res.body,
854 &res.headers,
855 res.status as i64,
856 );
857 }
858 }
859 }
860 }
861
862 tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
864 self.continue_request_with_url(
865 &event.request_id,
866 if had_replacer {
867 Some(current_url)
868 } else {
869 None
870 },
871 !had_replacer,
872 );
873 }
874 }
875
    /// Returns `true` when a target document URL has been recorded.
    pub fn has_target_domain(&self) -> bool {
        !self.document_target_url.is_empty()
    }
880
881 pub fn set_page_url(&mut self, page_target_url: String) {
883 let host_base = host_and_rest(&page_target_url)
884 .map(|(h, _)| base_domain_from_host(h))
885 .unwrap_or("");
886
887 self.document_target_domain = host_base.to_string();
888 self.document_target_url = page_target_url;
889 }
890
891 pub fn clear_target_domain(&mut self) {
893 self.document_reload_tracker = 0;
894 self.document_target_url = Default::default();
895 self.document_target_domain = Default::default();
896 }
897 #[inline]
905 fn handle_document_replacement_and_tracking<'a>(
906 &mut self,
907 event: &'a EventRequestPaused,
908 document_resource: bool,
909 ) -> (Cow<'a, str>, bool) {
910 let mut replacer: Option<String> = None;
911 let current_url = event.request.url.as_str();
912
913 if document_resource {
914 if self.document_target_url == current_url {
915 self.document_reload_tracker += 1;
916 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
917 {
918 let (http_document_replacement, mut https_document_replacement) =
919 if self.document_target_url.starts_with("http://") {
920 (
921 self.document_target_url.replacen("http://", "http//", 1),
922 self.document_target_url.replacen("http://", "https://", 1),
923 )
924 } else {
925 (
926 self.document_target_url.replacen("https://", "https//", 1),
927 self.document_target_url.replacen("https://", "http://", 1),
928 )
929 };
930
931 let trailing = https_document_replacement.ends_with('/');
933 if trailing {
934 https_document_replacement.pop();
935 }
936 if https_document_replacement.ends_with('/') {
937 https_document_replacement.pop();
938 }
939
940 let redirect_mask = format!(
941 "{}{}",
942 https_document_replacement, http_document_replacement
943 );
944
945 if current_url == redirect_mask {
946 replacer = Some(if trailing {
947 format!("{}/", https_document_replacement)
948 } else {
949 https_document_replacement
950 });
951 }
952 }
953
954 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
955 self.xml_document = true;
956 }
957
958 self.document_target_url = event.request.url.clone();
960 self.document_target_domain = host_and_rest(&self.document_target_url)
961 .map(|(h, _)| base_domain_from_host(h).to_string())
962 .unwrap_or_default();
963 }
964
965 let current_url_cow = match replacer {
966 Some(r) => Cow::Owned(r),
967 None => Cow::Borrowed(event.request.url.as_str()),
968 };
969
970 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
971 (current_url_cow, had_replacer)
972 }
973
974 #[inline]
976 fn should_skip_for_visuals_and_basic_js(
977 &self,
978 resource_type: &ResourceType,
979 javascript_resource: bool,
980 current_url: &str,
981 ) -> bool {
982 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
983 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
984 || (self.block_javascript
985 && javascript_resource
986 && self.intercept_manager == NetworkInterceptManager::Unknown
987 && !ALLOWED_MATCHER.is_match(current_url))
988 }
989
990 #[cfg(feature = "adblock")]
992 pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
993 use adblock::{
994 lists::{FilterSet, ParseOptions, RuleTypes},
995 Engine,
996 };
997
998 lazy_static::lazy_static! {
999 static ref AD_ENGINE: Engine = {
1000 let mut filter_set = FilterSet::new(false);
1001 let mut rules = ParseOptions::default();
1002 rules.rule_types = RuleTypes::All;
1003
1004 filter_set.add_filters(
1005 &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1006 rules,
1007 );
1008
1009 Engine::from_filter_set(filter_set, true)
1010 };
1011 };
1012
1013 let blockable = ResourceType::Image == event.resource_type
1014 || event.resource_type == ResourceType::Media
1015 || event.resource_type == ResourceType::Stylesheet
1016 || event.resource_type == ResourceType::Document
1017 || event.resource_type == ResourceType::Fetch
1018 || event.resource_type == ResourceType::Xhr;
1019
1020 let u = &event.request.url;
1021
1022 let block_request = blockable
1023 && {
1025 let request = adblock::request::Request::preparsed(
1026 &u,
1027 "example.com",
1028 "example.com",
1029 &event.resource_type.as_ref().to_lowercase(),
1030 !event.request.is_same_site.unwrap_or_default());
1031
1032 AD_ENGINE.check_network_request(&request).matched
1033 };
1034
1035 block_request
1036 }
1037
1038 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1039 let response = if self
1040 .attempted_authentications
1041 .contains(event.request_id.as_ref())
1042 {
1043 AuthChallengeResponseResponse::CancelAuth
1044 } else if self.credentials.is_some() {
1045 self.attempted_authentications
1046 .insert(event.request_id.clone().into());
1047 AuthChallengeResponseResponse::ProvideCredentials
1048 } else {
1049 AuthChallengeResponseResponse::Default
1050 };
1051
1052 let mut auth = AuthChallengeResponse::new(response);
1053 if let Some(creds) = self.credentials.clone() {
1054 auth.username = Some(creds.username);
1055 auth.password = Some(creds.password);
1056 }
1057 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1058 }
1059
1060 pub fn set_offline_mode(&mut self, value: bool) {
1062 if self.offline == value {
1063 return;
1064 }
1065 self.offline = value;
1066 if let Ok(network) = EmulateNetworkConditionsParams::builder()
1067 .offline(self.offline)
1068 .latency(0)
1069 .download_throughput(-1.)
1070 .upload_throughput(-1.)
1071 .build()
1072 {
1073 self.push_cdp_request(network);
1074 }
1075 }
1076
1077 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1079 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1080 if let Some(interception_id) = self
1081 .request_id_to_interception_id
1082 .remove(event.request_id.as_ref())
1083 {
1084 self.on_request(event, Some(interception_id));
1085 } else {
1086 self.requests_will_be_sent
1088 .insert(event.request_id.clone(), event.clone());
1089 }
1090 } else {
1091 self.on_request(event, None);
1092 }
1093 }
1094
    /// Marks the tracked request, if any, as served from the memory cache.
    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
            request.from_memory_cache = true;
        }
    }
1101
1102 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1104 let mut request_failed = false;
1105
1106 let mut deducted: u64 = 0;
1108
1109 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1110 let before = *max_bytes;
1111
1112 let received_bytes: u64 = event.response.encoded_data_length as u64;
1114
1115 let content_length: Option<u64> = event
1117 .response
1118 .headers
1119 .inner()
1120 .get("content-length")
1121 .and_then(|v| v.as_str())
1122 .and_then(|s| s.trim().parse::<u64>().ok());
1123
1124 *max_bytes = max_bytes.saturating_sub(received_bytes);
1126
1127 if let Some(cl) = content_length {
1129 if cl > *max_bytes {
1130 *max_bytes = 0;
1131 }
1132 }
1133
1134 request_failed = *max_bytes == 0;
1135
1136 deducted = before.saturating_sub(*max_bytes);
1138 }
1139
1140 if deducted > 0 {
1142 self.queued_events
1143 .push_back(NetworkEvent::BytesConsumed(deducted));
1144 }
1145
1146 if request_failed && self.max_bytes_allowed.is_some() {
1148 self.set_block_all(true);
1149 }
1150
1151 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1152 request.set_response(event.response.clone());
1153 self.queued_events.push_back(if request_failed {
1154 NetworkEvent::RequestFailed(request)
1155 } else {
1156 NetworkEvent::RequestFinished(request)
1157 });
1158 }
1159 }
1160
1161 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1163 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1164 if let Some(interception_id) = request.interception_id.as_ref() {
1165 self.attempted_authentications
1166 .remove(interception_id.as_ref());
1167 }
1168 self.queued_events
1169 .push_back(NetworkEvent::RequestFinished(request));
1170 }
1171 }
1172
1173 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1175 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1176 request.failure_text = Some(event.error_text.clone());
1177 if let Some(interception_id) = request.interception_id.as_ref() {
1178 self.attempted_authentications
1179 .remove(interception_id.as_ref());
1180 }
1181 self.queued_events
1182 .push_back(NetworkEvent::RequestFailed(request));
1183 }
1184 }
1185
    /// Registers a request and queues a `Request` event for it.
    ///
    /// When the event carries a redirect response for a request that is already
    /// tracked, the old request is closed with that response and appended to the
    /// new request's redirect chain. For redirect status codes, a `Location`
    /// header that differs from the response URL has every occurrence of that URL
    /// removed before it is stored.
    fn on_request(
        &mut self,
        event: &EventRequestWillBeSent,
        interception_id: Option<InterceptionId>,
    ) {
        let mut redirect_chain = Vec::new();
        let mut redirect_location = None;

        if let Some(redirect_resp) = &event.redirect_response {
            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
                if is_redirect_status(redirect_resp.status) {
                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
                        if redirect_resp.url != location {
                            // Strip the response URL out of the location value.
                            let fixed_location = location.replace(&redirect_resp.url, "");

                            if !fixed_location.is_empty() {
                                request.response.as_mut().map(|resp| {
                                    resp.headers.0["Location"] =
                                        serde_json::Value::String(fixed_location.clone());
                                });
                            }

                            redirect_location = Some(fixed_location);
                        }
                    }
                }

                // Close the redirected request with the (possibly patched) response.
                self.handle_request_redirect(
                    &mut request,
                    if let Some(redirect_location) = redirect_location {
                        let mut redirect_resp = redirect_resp.clone();

                        if !redirect_location.is_empty() {
                            redirect_resp.headers.0["Location"] =
                                serde_json::Value::String(redirect_location);
                        }

                        redirect_resp
                    } else {
                        redirect_resp.clone()
                    },
                );

                // The new request inherits the old chain plus the old request itself.
                redirect_chain = std::mem::take(&mut request.redirect_chain);
                redirect_chain.push(request);
            }
        }

        let request = HttpRequest::new(
            event.request_id.clone(),
            event.frame_id.clone(),
            interception_id,
            self.user_request_interception_enabled,
            redirect_chain,
        );

        self.requests.insert(event.request_id.clone(), request);
        self.queued_events
            .push_back(NetworkEvent::Request(event.request_id.clone()));
    }
1247
1248 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1250 request.set_response(response);
1251 if let Some(interception_id) = request.interception_id.as_ref() {
1252 self.attempted_authentications
1253 .remove(interception_id.as_ref());
1254 }
1255 }
1256}
1257
/// Events produced by `NetworkManager` and retrieved through `NetworkManager::poll`.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP command (method id and serialised parameters) to send to the browser.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request was registered.
    Request(RequestId),
    /// A response for the given request.
    /// NOTE(review): never emitted in this file — confirm whether it is still used.
    Response(RequestId),
    /// The request failed, or the byte budget ran out when its response arrived.
    RequestFailed(HttpRequest),
    /// The request finished.
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the byte budget by a response.
    BytesConsumed(u64),
}
1273
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;

    /// The third-party allow-list must match challenge/payment/captcha scripts
    /// and must not match analytics beacons.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Cloudflare challenge scripts are matched by path, on any host.
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Analytics beacons are not on the list and stay blocked.
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // Origin-prefix entries.
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }
}