1use super::blockers::{
2 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3 xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{
11 base_domain_from_any, base_domain_from_host, first_label, host_and_rest,
12 host_contains_label_icase, host_is_subdomain_of,
13};
14use aho_corasick::AhoCorasick;
15use case_insensitive_string::CaseInsensitiveString;
16use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
17use chromiumoxide_cdp::cdp::browser_protocol::network::{
18 EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
19 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
20 InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
21 SetExtraHttpHeadersParams,
22};
23use chromiumoxide_cdp::cdp::browser_protocol::{
24 fetch::{
25 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
26 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
27 },
28 network::SetBypassServiceWorkerParams,
29};
30use chromiumoxide_cdp::cdp::browser_protocol::{
31 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
32};
33use chromiumoxide_types::{Command, Method, MethodId};
34use hashbrown::{HashMap, HashSet};
35use lazy_static::lazy_static;
36use reqwest::header::PROXY_AUTHORIZATION;
37use spider_network_blocker::intercept_manager::NetworkInterceptManager;
38pub use spider_network_blocker::scripts::{
39 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
40};
41use std::borrow::Cow;
42use std::collections::VecDeque;
43use std::time::Duration;
44
lazy_static! {
    /// Substrings that exempt a script URL from `block_javascript`.
    ///
    /// Compiled into [`ALLOWED_MATCHER`]; a match anywhere in the URL is enough.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        // Framework / bundle name fragments.
        "jquery", "angular",
        "react", "vue", "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app", "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        // Path fragments and full origins.
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/", "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/enterprise.js",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://captcha.px-cloud.net/",
        "https://cdn.auth0.com/js/lock/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
    ];

    /// Aho-Corasick matcher over `JS_FRAMEWORK_ALLOW`; panics on first use if the automaton cannot be built.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// Substrings of script URLs that are let through even after an earlier stage of
    /// `on_fetch_request_paused` decided to block them.
    ///
    /// Compiled into [`ALLOWED_MATCHER_3RD_PARTY`]. Some entries repeat or are covered
    /// by a shorter entry; that does not change what the matcher accepts.
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://www.google.com/recaptcha/enterprise.js",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/captcha/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://ct.captcha-delivery.com/",
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.gstatic.com/recaptcha/",
        "https://www.google.com/recaptcha/api2/",
        "https://www.recaptcha.net/recaptcha/",
        "https://www.recaptcha.net/recaptcha/api2/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://client-api.arkoselabs.com/v2/",
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        // Path-only fragments: these match on any host.
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource",
    ];

    /// Aho-Corasick matcher over `JS_FRAMEWORK_ALLOW_3RD_PARTY`; panics on first use if the automaton cannot be built.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path prefixes checked by `ignore_script`: a path that starts with one of these
    /// and ends in `.js` is ignored when the intercept manager is `Unknown`.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            "_astro/", "_app/immutable"
        }
    };

    /// MIME types for binary, media and office payloads.
    ///
    /// NOTE(review): not referenced by the code visible in this file; presumably consumed elsewhere.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names that are blocked when `ignore_visuals` is set.
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names that `on_fetch_request_paused` marks for blocking regardless of configuration.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Manifest",
        "Other",
        "Prefetch",
        "Ping",
    };

    /// The `css` extension, compared case-insensitively against XHR URL suffixes in `skip_xhr`.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// Start-up command list holding only the network `EnableParams` command.
    ///
    /// Empty if the params cannot be serialized.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// Start-up command list: network `EnableParams` followed by
    /// `SetIgnoreCertificateErrorsParams::new(true)`.
    ///
    /// A command whose params fail to serialize is left out.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// Fetch-domain enable command: handles auth requests and pauses every URL (`*`) at the request stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
        .build()
    };
}
221
/// Returns `true` when `status` is one of the HTTP redirect codes this module
/// follows: 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];

    REDIRECT_CODES.contains(&status)
}
226
/// Tracks in-flight requests for a target and decides, per paused request,
/// whether it is blocked, fulfilled locally or allowed to continue.
#[derive(Debug)]
pub struct NetworkManager {
    /// Outgoing events, drained front-first by `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// Selects `INIT_CHAIN_IGNORE_HTTP_ERRORS` over `INIT_CHAIN` in `init_commands`.
    ignore_httpserrors: bool,
    /// Requests currently being tracked, keyed by network request id.
    requests: HashMap<RequestId, HttpRequest>,
    /// `requestWillBeSent` events held back until the matching paused event arrives.
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra headers last supplied through `set_extra_headers`.
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception ids of paused events that arrived before their `requestWillBeSent`.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the caller asked for the browser cache to be disabled.
    user_cache_disabled: bool,
    /// Request ids for which credentials have already been offered once.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials offered in response to auth challenges.
    credentials: Option<Credentials>,
    /// Whether the user asked to intercept requests themselves.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed as blocked.
    block_all: bool,
    /// Whether interception is considered active at the protocol level.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Last offline state passed to `set_offline_mode`.
    offline: bool,
    /// Timeout handed to the `CommandChain` built by `init_commands`.
    pub request_timeout: Duration,
    /// Block resource types listed in `IGNORE_VISUAL_RESOURCE_MAP` and related assets.
    pub ignore_visuals: bool,
    /// Block stylesheet resources.
    pub block_stylesheets: bool,
    /// Block scripts whose URL does not match `ALLOWED_MATCHER`.
    pub block_javascript: bool,
    /// Apply the analytics block-lists to scripts and XHR.
    pub block_analytics: bool,
    /// Together with `ignore_visuals`, enables the embedded-script check.
    pub only_html: bool,
    /// Set when the first document requested ends in `.xml`; lets `.xsl` resources through.
    pub xml_document: bool,
    /// Site-specific interception rules.
    pub intercept_manager: NetworkInterceptManager,
    /// Counts repeated document requests for the current target URL; 3 or more blocks further requests.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request (or the value given to `set_page_url`).
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`.
    pub document_target_domain: String,
    /// Remaining byte budget; `None` disables the budget.
    pub max_bytes_allowed: Option<u64>,
    /// Key identifying the site in the session cache.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// Policy deciding whether a cached entry may be served.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Raw whitelist substrings backing `whitelist_matcher`.
    whitelist_patterns: Vec<String>,
    /// Matcher compiled from `whitelist_patterns`; `None` when there are no patterns or the build failed.
    whitelist_matcher: Option<AhoCorasick>,
}
281
282impl NetworkManager {
283 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
285 Self {
286 queued_events: Default::default(),
287 ignore_httpserrors,
288 requests: Default::default(),
289 requests_will_be_sent: Default::default(),
290 extra_headers: Default::default(),
291 request_id_to_interception_id: Default::default(),
292 user_cache_disabled: false,
293 attempted_authentications: Default::default(),
294 credentials: None,
295 block_all: false,
296 user_request_interception_enabled: false,
297 protocol_request_interception_enabled: false,
298 offline: false,
299 request_timeout,
300 ignore_visuals: false,
301 block_javascript: false,
302 block_stylesheets: false,
303 block_analytics: true,
304 only_html: false,
305 xml_document: false,
306 intercept_manager: NetworkInterceptManager::Unknown,
307 document_reload_tracker: 0,
308 document_target_url: String::new(),
309 document_target_domain: String::new(),
310 whitelist_patterns: Vec::new(),
311 whitelist_matcher: None,
312 max_bytes_allowed: None,
313 #[cfg(feature = "_cache")]
314 cache_site_key: None,
315 #[cfg(feature = "_cache")]
316 cache_policy: None,
317 }
318 }
319
320 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
322 where
323 I: IntoIterator<Item = S>,
324 S: Into<String>,
325 {
326 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
327 self.rebuild_whitelist_matcher();
328 }
329
330 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
332 self.whitelist_patterns.push(pattern.into());
333 self.rebuild_whitelist_matcher();
334 }
335
336 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
338 where
339 I: IntoIterator<Item = S>,
340 S: Into<String>,
341 {
342 self.whitelist_patterns
343 .extend(patterns.into_iter().map(Into::into));
344 self.rebuild_whitelist_matcher();
345 }
346
347 #[inline]
348 fn rebuild_whitelist_matcher(&mut self) {
349 if self.whitelist_patterns.is_empty() {
350 self.whitelist_matcher = None;
351 return;
352 }
353
354 let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
355
356 self.whitelist_matcher = AhoCorasick::new(refs).ok();
358 }
359
360 #[inline]
361 fn is_whitelisted(&self, url: &str) -> bool {
362 self.whitelist_matcher
363 .as_ref()
364 .map(|m| m.is_match(url))
365 .unwrap_or(false)
366 }
367
368 pub fn init_commands(&self) -> CommandChain {
370 let cmds = if self.ignore_httpserrors {
371 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
372 } else {
373 INIT_CHAIN.clone()
374 };
375 CommandChain::new(cmds, self.request_timeout)
376 }
377
378 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
380 let method = cmd.identifier();
381 if let Ok(params) = serde_json::to_value(cmd) {
382 self.queued_events
383 .push_back(NetworkEvent::SendCdpRequest((method, params)));
384 }
385 }
386
    /// Removes and returns the oldest queued event, or `None` when the queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
391
    /// Returns the extra headers stored by the last `set_extra_headers` call.
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
396
397 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
399 self.extra_headers = headers;
400 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
401 self.extra_headers.remove("Proxy-Authorization");
402 if !self.extra_headers.is_empty() {
403 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
404 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
405 }
406 }
407 }
408
    /// Queues a `SetBypassServiceWorkerParams` command carrying `bypass`.
    ///
    /// NOTE(review): despite the method name, the argument is forwarded as the
    /// *bypass* flag — confirm callers pass the intended polarity.
    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
    }
412
    /// Sets the flag that makes `on_fetch_request_paused` fail every paused request.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
416
    /// Records whether the user wants to intercept requests, then queues the
    /// Fetch enable/disable command if the required protocol state changed.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        // Must run after the flag is stored: the update reads it.
        self.update_protocol_request_interception();
    }
421
422 pub fn set_cache_enabled(&mut self, enabled: bool) {
423 let run = self.user_cache_disabled != !enabled;
424 self.user_cache_disabled = !enabled;
425 if run {
426 self.update_protocol_cache_disabled();
427 }
428 }
429
    /// Marks protocol-level interception as enabled. Only the flag is changed; no command is queued.
    pub fn enable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
434
    /// Marks protocol-level interception as disabled. Only the flag is changed; no command is queued.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = false;
    }
439
    /// Sets the site key used to look up session-cache entries in `on_fetch_request_paused`.
    #[cfg(feature = "_cache")]
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
445
    /// Sets the policy that decides whether a cached entry may be served.
    #[cfg(feature = "_cache")]
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
451
    /// Queues a `SetCacheDisabledParams` command reflecting the current `user_cache_disabled` flag.
    pub fn update_protocol_cache_disabled(&mut self) {
        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
    }
455
    /// Stores `credentials` for answering auth challenges and turns interception on.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        // Order matters: the update compares against the flag *before* it is
        // set below, so the Fetch enable command is queued when needed.
        self.update_protocol_request_interception();
        self.protocol_request_interception_enabled = true;
    }
461
462 fn update_protocol_request_interception(&mut self) {
463 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
464
465 if enabled == self.protocol_request_interception_enabled {
466 return;
467 }
468
469 if enabled {
470 self.push_cdp_request(ENABLE_FETCH.clone())
471 } else {
472 self.push_cdp_request(DisableParams::default())
473 }
474 }
475
476 #[inline]
477 fn rel_for_ignore_script<'a>(&self, url: &'a str) -> Cow<'a, str> {
478 if url.starts_with('/') {
479 return Cow::Borrowed(url);
480 }
481
482 let base_raw = self.document_target_domain.as_str();
483
484 if base_raw.is_empty() {
485 return Cow::Borrowed(url);
486 }
487
488 let base = base_domain_from_any(base_raw).trim_end_matches('.');
489 if base.is_empty() {
490 return Cow::Borrowed(url);
491 }
492
493 let brand = first_label(base);
494
495 if let Some((host, rest)) = host_and_rest(url) {
496 if host_is_subdomain_of(host, base) || host_contains_label_icase(host, brand) {
497 return if rest.starts_with('/') {
498 Cow::Borrowed(rest)
499 } else {
500 Cow::Borrowed("/")
501 };
502 }
503 }
504
505 Cow::Borrowed(url)
506 }
507
508 #[inline]
510 pub(crate) fn ignore_script(
511 &self,
512 url: &str,
513 block_analytics: bool,
514 intercept_manager: NetworkInterceptManager,
515 ) -> bool {
516 let mut ignore_script = !url.starts_with("/");
518
519 if !ignore_script
520 && block_analytics
521 && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
522 {
523 ignore_script = true;
524 }
525
526 if !ignore_script {
527 if let Some(index) = url.find("//") {
528 let pos = index + 2;
529
530 if pos < url.len() {
532 if let Some(slash_index) = url[pos..].find('/') {
534 let base_path_index = pos + slash_index + 1;
535
536 if url.len() > base_path_index {
537 let new_url: &str = &url[base_path_index..];
538
539 if !ignore_script
541 && intercept_manager == NetworkInterceptManager::Unknown
542 {
543 let hydration_file =
544 JS_FRAMEWORK_PATH.iter().any(|p| new_url.starts_with(p));
545
546 if hydration_file && new_url.ends_with(".js") {
548 ignore_script = true;
549 }
550 }
551
552 if !ignore_script
553 && URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(new_url)
554 {
555 ignore_script = true;
556 }
557
558 if !ignore_script
559 && self.ignore_visuals
560 && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(new_url)
561 {
562 ignore_script = true;
563 }
564 }
565 }
566 }
567 }
568 }
569
570 if !ignore_script && block_analytics {
572 ignore_script = URL_IGNORE_TRIE_PATHS.contains_prefix(url);
573 }
574
575 ignore_script
576 }
577
578 #[inline]
580 fn skip_xhr(
581 &self,
582 skip_networking: bool,
583 event: &EventRequestPaused,
584 network_event: bool,
585 ) -> bool {
586 if !skip_networking && network_event {
588 let request_url = event.request.url.as_str();
589
590 let skip_analytics =
592 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
593
594 if skip_analytics {
595 true
596 } else if self.block_stylesheets || self.ignore_visuals {
597 let block_css = self.block_stylesheets;
598 let block_media = self.ignore_visuals;
599
600 let mut block_request = false;
601
602 if let Some(position) = request_url.rfind('.') {
603 let hlen = request_url.len();
604 let has_asset = hlen - position;
605
606 if has_asset >= 3 {
607 let next_position = position + 1;
608
609 if block_media
610 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
611 &request_url[next_position..].into(),
612 )
613 {
614 block_request = true;
615 } else if block_css {
616 block_request =
617 CaseInsensitiveString::from(request_url[next_position..].as_bytes())
618 .contains(&**CSS_EXTENSION)
619 }
620 }
621 }
622
623 if !block_request {
624 block_request = ignore_script_xhr_media(request_url);
625 }
626
627 block_request
628 } else {
629 skip_networking
630 }
631 } else {
632 skip_networking
633 }
634 }
635
636 #[cfg(feature = "adblock")]
637 #[inline]
638 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
640 if skip_networking {
641 true
642 } else {
643 self.detect_ad(event)
644 }
645 }
646
    /// Without the `adblock` feature there is no ad detection: the incoming
    /// decision is returned unchanged.
    #[cfg(not(feature = "adblock"))]
    #[inline]
    fn detect_ad_if_enabled(&mut self, _event: &EventRequestPaused, skip_networking: bool) -> bool {
        skip_networking
    }
653
654 #[inline]
655 fn fail_request_blocked(
657 &mut self,
658 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
659 ) {
660 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
661 request_id.clone(),
662 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
663 );
664 self.push_cdp_request(params);
665 }
666
667 #[inline]
668 fn fulfill_request_empty_200(
670 &mut self,
671 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
672 ) {
673 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
674 request_id.clone(),
675 200,
676 );
677 self.push_cdp_request(params);
678 }
679
680 #[cfg(feature = "_cache")]
681 #[inline]
682 fn fulfill_request_from_cache(
686 &mut self,
687 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
688 body: &[u8],
689 headers: &std::collections::HashMap<String, String>,
690 status: i64,
691 ) {
692 use crate::cdp::browser_protocol::fetch::HeaderEntry;
693 use crate::handler::network::fetch::FulfillRequestParams;
694 use base64::Engine;
695
696 let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
697
698 for (k, v) in headers.iter() {
699 resp_headers.push(HeaderEntry {
700 name: k.clone().into(),
701 value: v.clone().into(),
702 });
703 }
704
705 let mut params = FulfillRequestParams::new(request_id.clone(), status);
706
707 params.body = Some(
709 base64::engine::general_purpose::STANDARD
710 .encode(body)
711 .into(),
712 );
713
714 params.response_headers = Some(resp_headers);
715
716 self.push_cdp_request(params);
717 }
718
719 #[inline]
720 fn continue_request_with_url(
722 &mut self,
723 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
724 url: Option<&str>,
725 intercept_response: bool,
726 ) {
727 let mut params = ContinueRequestParams::new(request_id.clone());
728 if let Some(url) = url {
729 params.url = Some(url.to_string());
730 params.intercept_response = Some(intercept_response);
731 }
732 self.push_cdp_request(params);
733 }
734
735 #[inline]
737 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
738 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
739 return;
740 }
741
742 let resource_type = &event.resource_type;
743
744 if self.block_all {
745 tracing::debug!(
746 "Blocked (block_all): {:?} - {}",
747 event.resource_type,
748 event.request.url
749 );
750 return self.fail_request_blocked(&event.request_id);
751 }
752
753 if let Some(network_id) = event.network_id.as_ref() {
759 if let Some(request_will_be_sent) =
760 self.requests_will_be_sent.remove(network_id.as_ref())
761 {
762 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
763 } else {
764 self.request_id_to_interception_id
765 .insert(network_id.clone(), event.request_id.clone().into());
766 }
767 }
768
769 let javascript_resource = *resource_type == ResourceType::Script;
771 let document_resource = *resource_type == ResourceType::Document;
772 let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
773
774 let mut skip_networking =
776 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
777
778 if !skip_networking {
780 skip_networking = self.document_reload_tracker >= 3;
781 }
782
783 let (current_url_cow, had_replacer) =
785 self.handle_document_replacement_and_tracking(event, document_resource);
786
787 let current_url: &str = current_url_cow.as_ref();
788
789 if !skip_networking {
791 if self.xml_document && current_url.ends_with(".xsl") {
793 skip_networking = false;
794 } else {
795 skip_networking = self.should_skip_for_visuals_and_basic_js(
796 resource_type,
797 javascript_resource,
798 current_url,
799 );
800 }
801 }
802
803 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
805
806 if !skip_networking
808 && (self.only_html || self.ignore_visuals)
809 && (javascript_resource || document_resource)
810 {
811 skip_networking = ignore_script_embedded(current_url);
812 }
813
814 if skip_networking && javascript_resource {
816 let rel = self.rel_for_ignore_script(current_url);
817 skip_networking =
818 self.ignore_script(rel.as_ref(), self.block_analytics, self.intercept_manager);
819 }
820
821 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
823
824 if !skip_networking && (javascript_resource || network_resource || document_resource) {
826 skip_networking = self.intercept_manager.intercept_detection(
827 current_url,
828 self.ignore_visuals,
829 network_resource,
830 );
831 }
832
833 if !skip_networking && (javascript_resource || network_resource) {
835 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
836 }
837
838 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
840 {
841 skip_networking = false;
842 }
843
844 if skip_networking && self.is_whitelisted(current_url) {
846 skip_networking = false;
847 }
848
849 if skip_networking {
850 tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
851 self.fulfill_request_empty_200(&event.request_id);
852 } else {
853 #[cfg(feature = "_cache")]
854 {
855 if let (Some(policy), Some(cache_site_key)) =
856 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
857 {
858 let current_url = format!("{}:{}", event.request.method, ¤t_url);
859
860 if let Some((res, cache_policy)) =
861 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
862 {
863 if policy.allows_cached(&cache_policy) {
864 tracing::debug!(
865 "Remote Cached: {:?} - {}",
866 resource_type,
867 ¤t_url
868 );
869 return self.fulfill_request_from_cache(
870 &event.request_id,
871 &res.body,
872 &res.headers,
873 res.status as i64,
874 );
875 }
876 }
877 }
878 }
879
880 tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
882 self.continue_request_with_url(
883 &event.request_id,
884 if had_replacer {
885 Some(current_url)
886 } else {
887 None
888 },
889 !had_replacer,
890 );
891 }
892 }
893
    /// Returns `true` when a document target URL is currently recorded.
    pub fn has_target_domain(&self) -> bool {
        !self.document_target_url.is_empty()
    }
898
899 pub fn set_page_url(&mut self, page_target_url: String) {
901 let host_base = host_and_rest(&page_target_url)
902 .map(|(h, _)| base_domain_from_host(h))
903 .unwrap_or("");
904
905 self.document_target_domain = host_base.to_string();
906 self.document_target_url = page_target_url;
907 }
908
909 pub fn clear_target_domain(&mut self) {
911 self.document_reload_tracker = 0;
912 self.document_target_url = Default::default();
913 self.document_target_domain = Default::default();
914 }
915 #[inline]
923 fn handle_document_replacement_and_tracking<'a>(
924 &mut self,
925 event: &'a EventRequestPaused,
926 document_resource: bool,
927 ) -> (Cow<'a, str>, bool) {
928 let mut replacer: Option<String> = None;
929 let current_url = event.request.url.as_str();
930
931 if document_resource {
932 if self.document_target_url == current_url {
933 self.document_reload_tracker += 1;
934 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
935 {
936 let (http_document_replacement, mut https_document_replacement) =
937 if self.document_target_url.starts_with("http://") {
938 (
939 self.document_target_url.replacen("http://", "http//", 1),
940 self.document_target_url.replacen("http://", "https://", 1),
941 )
942 } else {
943 (
944 self.document_target_url.replacen("https://", "https//", 1),
945 self.document_target_url.replacen("https://", "http://", 1),
946 )
947 };
948
949 let trailing = https_document_replacement.ends_with('/');
951 if trailing {
952 https_document_replacement.pop();
953 }
954 if https_document_replacement.ends_with('/') {
955 https_document_replacement.pop();
956 }
957
958 let redirect_mask = format!(
959 "{}{}",
960 https_document_replacement, http_document_replacement
961 );
962
963 if current_url == redirect_mask {
964 replacer = Some(if trailing {
965 format!("{}/", https_document_replacement)
966 } else {
967 https_document_replacement
968 });
969 }
970 }
971
972 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
973 self.xml_document = true;
974 }
975
976 self.document_target_url = event.request.url.clone();
978 self.document_target_domain = host_and_rest(&self.document_target_url)
979 .map(|(h, _)| base_domain_from_host(h).to_string())
980 .unwrap_or_default();
981 }
982
983 let current_url_cow = match replacer {
984 Some(r) => Cow::Owned(r),
985 None => Cow::Borrowed(event.request.url.as_str()),
986 };
987
988 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
989 (current_url_cow, had_replacer)
990 }
991
992 #[inline]
994 fn should_skip_for_visuals_and_basic_js(
995 &self,
996 resource_type: &ResourceType,
997 javascript_resource: bool,
998 current_url: &str,
999 ) -> bool {
1000 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1001 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1002 || (self.block_javascript
1003 && javascript_resource
1004 && self.intercept_manager == NetworkInterceptManager::Unknown
1005 && !ALLOWED_MATCHER.is_match(current_url))
1006 }
1007
1008 #[cfg(feature = "adblock")]
1010 pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1011 use adblock::{
1012 lists::{FilterSet, ParseOptions, RuleTypes},
1013 Engine,
1014 };
1015
1016 lazy_static::lazy_static! {
1017 static ref AD_ENGINE: Engine = {
1018 let mut filter_set = FilterSet::new(false);
1019 let mut rules = ParseOptions::default();
1020 rules.rule_types = RuleTypes::All;
1021
1022 filter_set.add_filters(
1023 &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1024 rules,
1025 );
1026
1027 Engine::from_filter_set(filter_set, true)
1028 };
1029 };
1030
1031 let blockable = ResourceType::Image == event.resource_type
1032 || event.resource_type == ResourceType::Media
1033 || event.resource_type == ResourceType::Stylesheet
1034 || event.resource_type == ResourceType::Document
1035 || event.resource_type == ResourceType::Fetch
1036 || event.resource_type == ResourceType::Xhr;
1037
1038 let u = &event.request.url;
1039
1040 let block_request = blockable
1041 && {
1043 let request = adblock::request::Request::preparsed(
1044 &u,
1045 "example.com",
1046 "example.com",
1047 &event.resource_type.as_ref().to_lowercase(),
1048 !event.request.is_same_site.unwrap_or_default());
1049
1050 AD_ENGINE.check_network_request(&request).matched
1051 };
1052
1053 block_request
1054 }
1055
1056 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1057 let response = if self
1058 .attempted_authentications
1059 .contains(event.request_id.as_ref())
1060 {
1061 AuthChallengeResponseResponse::CancelAuth
1062 } else if self.credentials.is_some() {
1063 self.attempted_authentications
1064 .insert(event.request_id.clone().into());
1065 AuthChallengeResponseResponse::ProvideCredentials
1066 } else {
1067 AuthChallengeResponseResponse::Default
1068 };
1069
1070 let mut auth = AuthChallengeResponse::new(response);
1071 if let Some(creds) = self.credentials.clone() {
1072 auth.username = Some(creds.username);
1073 auth.password = Some(creds.password);
1074 }
1075 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1076 }
1077
1078 pub fn set_offline_mode(&mut self, value: bool) {
1080 if self.offline == value {
1081 return;
1082 }
1083 self.offline = value;
1084 if let Ok(network) = EmulateNetworkConditionsParams::builder()
1085 .offline(self.offline)
1086 .latency(0)
1087 .download_throughput(-1.)
1088 .upload_throughput(-1.)
1089 .build()
1090 {
1091 self.push_cdp_request(network);
1092 }
1093 }
1094
1095 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1097 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1098 if let Some(interception_id) = self
1099 .request_id_to_interception_id
1100 .remove(event.request_id.as_ref())
1101 {
1102 self.on_request(event, Some(interception_id));
1103 } else {
1104 self.requests_will_be_sent
1106 .insert(event.request_id.clone(), event.clone());
1107 }
1108 } else {
1109 self.on_request(event, None);
1110 }
1111 }
1112
1113 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1115 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1116 request.from_memory_cache = true;
1117 }
1118 }
1119
1120 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1122 let mut request_failed = false;
1123
1124 let mut deducted: u64 = 0;
1126
1127 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1128 let before = *max_bytes;
1129
1130 let received_bytes: u64 = event.response.encoded_data_length as u64;
1132
1133 let content_length: Option<u64> = event
1135 .response
1136 .headers
1137 .inner()
1138 .get("content-length")
1139 .and_then(|v| v.as_str())
1140 .and_then(|s| s.trim().parse::<u64>().ok());
1141
1142 *max_bytes = max_bytes.saturating_sub(received_bytes);
1144
1145 if let Some(cl) = content_length {
1147 if cl > *max_bytes {
1148 *max_bytes = 0;
1149 }
1150 }
1151
1152 request_failed = *max_bytes == 0;
1153
1154 deducted = before.saturating_sub(*max_bytes);
1156 }
1157
1158 if deducted > 0 {
1160 self.queued_events
1161 .push_back(NetworkEvent::BytesConsumed(deducted));
1162 }
1163
1164 if request_failed && self.max_bytes_allowed.is_some() {
1166 self.set_block_all(true);
1167 }
1168
1169 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1170 request.set_response(event.response.clone());
1171 self.queued_events.push_back(if request_failed {
1172 NetworkEvent::RequestFailed(request)
1173 } else {
1174 NetworkEvent::RequestFinished(request)
1175 });
1176 }
1177 }
1178
1179 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1181 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1182 if let Some(interception_id) = request.interception_id.as_ref() {
1183 self.attempted_authentications
1184 .remove(interception_id.as_ref());
1185 }
1186 self.queued_events
1187 .push_back(NetworkEvent::RequestFinished(request));
1188 }
1189 }
1190
1191 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1193 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1194 request.failure_text = Some(event.error_text.clone());
1195 if let Some(interception_id) = request.interception_id.as_ref() {
1196 self.attempted_authentications
1197 .remove(interception_id.as_ref());
1198 }
1199 self.queued_events
1200 .push_back(NetworkEvent::RequestFailed(request));
1201 }
1202 }
1203
1204 fn on_request(
1206 &mut self,
1207 event: &EventRequestWillBeSent,
1208 interception_id: Option<InterceptionId>,
1209 ) {
1210 let mut redirect_chain = Vec::new();
1211 let mut redirect_location = None;
1212
1213 if let Some(redirect_resp) = &event.redirect_response {
1214 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1215 if is_redirect_status(redirect_resp.status) {
1216 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1217 if redirect_resp.url != location {
1218 let fixed_location = location.replace(&redirect_resp.url, "");
1219
1220 if !fixed_location.is_empty() {
1221 request.response.as_mut().map(|resp| {
1222 resp.headers.0["Location"] =
1223 serde_json::Value::String(fixed_location.clone());
1224 });
1225 }
1226
1227 redirect_location = Some(fixed_location);
1228 }
1229 }
1230 }
1231
1232 self.handle_request_redirect(
1233 &mut request,
1234 if let Some(redirect_location) = redirect_location {
1235 let mut redirect_resp = redirect_resp.clone();
1236
1237 if !redirect_location.is_empty() {
1238 redirect_resp.headers.0["Location"] =
1239 serde_json::Value::String(redirect_location);
1240 }
1241
1242 redirect_resp
1243 } else {
1244 redirect_resp.clone()
1245 },
1246 );
1247
1248 redirect_chain = std::mem::take(&mut request.redirect_chain);
1249 redirect_chain.push(request);
1250 }
1251 }
1252
1253 let request = HttpRequest::new(
1254 event.request_id.clone(),
1255 event.frame_id.clone(),
1256 interception_id,
1257 self.user_request_interception_enabled,
1258 redirect_chain,
1259 );
1260
1261 self.requests.insert(event.request_id.clone(), request);
1262 self.queued_events
1263 .push_back(NetworkEvent::Request(event.request_id.clone()));
1264 }
1265
1266 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1268 request.set_response(response);
1269 if let Some(interception_id) = request.interception_id.as_ref() {
1270 self.attempted_authentications
1271 .remove(interception_id.as_ref());
1272 }
1273 }
1274}
1275
/// Events produced by `NetworkManager` and drained through `NetworkManager::poll`.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP command to send: method identifier plus serialized params.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request with this id was registered.
    Request(RequestId),
    /// NOTE(review): not constructed by the code visible in this file.
    Response(RequestId),
    /// The request failed to load, or its response exhausted the byte budget.
    RequestFailed(HttpRequest),
    /// The request completed.
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the byte budget by a response.
    BytesConsumed(u64),
}
1291
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;

    /// The third-party allow-list accepts challenge, payment and CDN scripts
    /// and keeps rejecting analytics beacons.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        let is_allowed = |url: &str| ALLOWED_MATCHER_3RD_PARTY.is_match(url);

        // Path-only pattern: matches regardless of the host.
        assert!(
            is_allowed("https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e"),
            "expected Cloudflare challenge script to be allowed"
        );

        assert!(
            !is_allowed("https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015"),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        let explicitly_allowed = [
            "https://js.stripe.com/v3/",
            "https://www.google.com/recaptcha/api.js?render=explicit",
            "https://code.jquery.com/jquery-3.7.1.min.js",
        ];

        for url in explicitly_allowed.iter() {
            assert!(is_allowed(*url));
        }
    }
}