1use super::blockers::{
2 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3 xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{
11 base_domain_from_any, base_domain_from_host, first_label, host_and_rest,
12 host_contains_label_icase, host_is_subdomain_of,
13};
14use aho_corasick::AhoCorasick;
15use case_insensitive_string::CaseInsensitiveString;
16use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
17use chromiumoxide_cdp::cdp::browser_protocol::network::{
18 EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
19 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
20 InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
21 SetExtraHttpHeadersParams,
22};
23use chromiumoxide_cdp::cdp::browser_protocol::{
24 fetch::{
25 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
26 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
27 },
28 network::SetBypassServiceWorkerParams,
29};
30use chromiumoxide_cdp::cdp::browser_protocol::{
31 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
32};
33use chromiumoxide_types::{Command, Method, MethodId};
34use hashbrown::{HashMap, HashSet};
35use lazy_static::lazy_static;
36use reqwest::header::PROXY_AUTHORIZATION;
37use spider_network_blocker::intercept_manager::NetworkInterceptManager;
38pub use spider_network_blocker::scripts::{
39 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
40};
41use std::borrow::Cow;
42use std::collections::VecDeque;
43use std::time::Duration;
44
lazy_static! {
    /// URL substrings (framework/bundle names, challenge, captcha and payment hosts) that
    /// feed [`ALLOWED_MATCHER`]. Matching is plain substring matching, so short entries such
    /// as `"app"` or `"main"` match anywhere in a URL.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery", "angular",
        "react", "vue", "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app", "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/", "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/enterprise.js",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://captcha.px-cloud.net/",
        "https://cdn.auth0.com/js/lock/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
    ];

    /// Matcher over [`JS_FRAMEWORK_ALLOW`]; a script URL that matches is exempt from the
    /// `block_javascript` rule in `should_skip_for_visuals_and_basic_js`.
    /// Building the automaton panics on first access if it fails.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// URL substrings of third-party scripts that are un-blocked at the end of
    /// `on_fetch_request_paused` even when an earlier rule decided to skip them.
    // NOTE(review): "https://ct.captcha-delivery.com/" is listed twice below; the duplicate
    // does not change `is_match` results.
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://www.google.com/recaptcha/enterprise.js",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/captcha/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://ct.captcha-delivery.com/",
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "/cdn-cgi/challenge-platform/"
    ];

    /// Matcher over [`JS_FRAMEWORK_ALLOW_3RD_PARTY`].
    /// Building the automaton panics on first access if it fails.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path prefixes checked by `ignore_script`: a path starting with one of these and
    /// ending in `.js` is ignored when the intercept manager is `Unknown`.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            "_astro/", "_app/immutable"
        }
    };

    /// MIME type strings for documents, archives, images, audio and video.
    /// Not referenced in this part of the file; presumably consulted by response handling
    /// elsewhere — confirm against callers.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names that are skipped when `ignore_visuals` is set.
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names that `on_fetch_request_paused` marks for skipping up front,
    /// regardless of the other toggles.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Manifest",
        "Other",
        "Prefetch",
        "Ping",
    };

    /// The `css` extension, compared case-insensitively in `skip_xhr`.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// Initial command list: the default network `EnableParams`, as a
    /// `(method identifier, serialized params)` pair. Empty if serialization fails.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// Same as [`INIT_CHAIN`], followed by `SetIgnoreCertificateErrorsParams::new(true)`.
    /// Each command is only included if it serializes successfully.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// Fetch-domain enable command: auth requests are handled, and a single `"*"` URL
    /// pattern pauses requests at the `Request` stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
            .handle_auth_requests(true)
            .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
            .build()
    };
}
202
/// Reports whether `status` is one of the redirect codes this module rewrites
/// `Location` headers for: 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    match status {
        301..=303 => true,
        307 | 308 => true,
        _ => false,
    }
}
207
/// Tracks in-flight network requests for a target and decides, for each paused fetch
/// request, whether to continue, block, or fulfill it. Outgoing CDP commands and request
/// lifecycle notifications are queued as [`NetworkEvent`]s and drained through `poll`.
#[derive(Debug)]
pub struct NetworkManager {
    /// Pending events, drained front-to-back by `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// Selects `INIT_CHAIN_IGNORE_HTTP_ERRORS` over `INIT_CHAIN` in `init_commands`.
    ignore_httpserrors: bool,
    /// Requests currently being tracked, keyed by network request id.
    requests: HashMap<RequestId, HttpRequest>,
    /// `requestWillBeSent` events held back until the matching paused-request event arrives.
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra headers last given to `set_extra_headers` (proxy authorization removed).
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception ids of paused requests whose `requestWillBeSent` event has not arrived yet.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Inverse of the value last passed to `set_cache_enabled`.
    user_cache_disabled: bool,
    /// Requests for which credentials were already supplied once; a repeated auth
    /// challenge for the same id is cancelled.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials supplied through `authenticate`, if any.
    credentials: Option<Credentials>,
    /// Set through `set_request_interception`.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed with `BlockedByClient`.
    block_all: bool,
    /// Whether fetch interception is considered active at the protocol level.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Last value applied by `set_offline_mode`.
    offline: bool,
    /// Timeout handed to the `CommandChain` built by `init_commands`.
    pub request_timeout: Duration,
    /// Skip resource types listed in `IGNORE_VISUAL_RESOURCE_MAP` (and related asset checks).
    pub ignore_visuals: bool,
    /// Skip stylesheet resources.
    pub block_stylesheets: bool,
    /// Skip scripts that do not match `ALLOWED_MATCHER` (only while the intercept manager
    /// is `Unknown`).
    pub block_javascript: bool,
    /// Enable the analytics-related script/XHR block lists. Defaults to `true` in `new`.
    pub block_analytics: bool,
    /// Together with `ignore_visuals`, enables the embedded-script check for scripts and
    /// documents.
    pub only_html: bool,
    /// Set when the first tracked document URL ends in `.xml`; `.xsl` requests are then
    /// exempt from the visual/JS skip rules.
    pub xml_document: bool,
    /// Site-specific interception rules; `Unknown` by default.
    pub intercept_manager: NetworkInterceptManager,
    /// Number of times the current document URL was requested again; at 3 or more,
    /// further requests are skipped.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request (or the value given to `set_page_url`).
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`'s host.
    pub document_target_domain: String,
    /// Remaining byte budget; when it reaches zero, `block_all` is switched on.
    pub max_bytes_allowed: Option<u64>,
    /// Key used to look up remotely cached responses.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// Policy deciding whether a cached response may be served.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Substring patterns whose matching URLs are never skipped.
    whitelist_patterns: Vec<String>,
    /// Matcher compiled from `whitelist_patterns`; `None` when the list is empty or the
    /// build failed.
    whitelist_matcher: Option<AhoCorasick>,
}
262
263impl NetworkManager {
264 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
266 Self {
267 queued_events: Default::default(),
268 ignore_httpserrors,
269 requests: Default::default(),
270 requests_will_be_sent: Default::default(),
271 extra_headers: Default::default(),
272 request_id_to_interception_id: Default::default(),
273 user_cache_disabled: false,
274 attempted_authentications: Default::default(),
275 credentials: None,
276 block_all: false,
277 user_request_interception_enabled: false,
278 protocol_request_interception_enabled: false,
279 offline: false,
280 request_timeout,
281 ignore_visuals: false,
282 block_javascript: false,
283 block_stylesheets: false,
284 block_analytics: true,
285 only_html: false,
286 xml_document: false,
287 intercept_manager: NetworkInterceptManager::Unknown,
288 document_reload_tracker: 0,
289 document_target_url: String::new(),
290 document_target_domain: String::new(),
291 whitelist_patterns: Vec::new(),
292 whitelist_matcher: None,
293 max_bytes_allowed: None,
294 #[cfg(feature = "_cache")]
295 cache_site_key: None,
296 #[cfg(feature = "_cache")]
297 cache_policy: None,
298 }
299 }
300
301 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
303 where
304 I: IntoIterator<Item = S>,
305 S: Into<String>,
306 {
307 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
308 self.rebuild_whitelist_matcher();
309 }
310
311 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
313 self.whitelist_patterns.push(pattern.into());
314 self.rebuild_whitelist_matcher();
315 }
316
317 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
319 where
320 I: IntoIterator<Item = S>,
321 S: Into<String>,
322 {
323 self.whitelist_patterns
324 .extend(patterns.into_iter().map(Into::into));
325 self.rebuild_whitelist_matcher();
326 }
327
328 #[inline]
329 fn rebuild_whitelist_matcher(&mut self) {
330 if self.whitelist_patterns.is_empty() {
331 self.whitelist_matcher = None;
332 return;
333 }
334
335 let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
336
337 self.whitelist_matcher = AhoCorasick::new(refs).ok();
339 }
340
341 #[inline]
342 fn is_whitelisted(&self, url: &str) -> bool {
343 self.whitelist_matcher
344 .as_ref()
345 .map(|m| m.is_match(url))
346 .unwrap_or(false)
347 }
348
349 pub fn init_commands(&self) -> CommandChain {
351 let cmds = if self.ignore_httpserrors {
352 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
353 } else {
354 INIT_CHAIN.clone()
355 };
356 CommandChain::new(cmds, self.request_timeout)
357 }
358
359 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
361 let method = cmd.identifier();
362 if let Ok(params) = serde_json::to_value(cmd) {
363 self.queued_events
364 .push_back(NetworkEvent::SendCdpRequest((method, params)));
365 }
366 }
367
368 pub fn poll(&mut self) -> Option<NetworkEvent> {
370 self.queued_events.pop_front()
371 }
372
373 pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
375 &self.extra_headers
376 }
377
378 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
380 self.extra_headers = headers;
381 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
382 self.extra_headers.remove("Proxy-Authorization");
383 if !self.extra_headers.is_empty() {
384 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
385 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
386 }
387 }
388 }
389
390 pub fn set_service_worker_enabled(&mut self, bypass: bool) {
391 self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
392 }
393
394 pub fn set_block_all(&mut self, block_all: bool) {
395 self.block_all = block_all;
396 }
397
398 pub fn set_request_interception(&mut self, enabled: bool) {
399 self.user_request_interception_enabled = enabled;
400 self.update_protocol_request_interception();
401 }
402
403 pub fn set_cache_enabled(&mut self, enabled: bool) {
404 let run = self.user_cache_disabled != !enabled;
405 self.user_cache_disabled = !enabled;
406 if run {
407 self.update_protocol_cache_disabled();
408 }
409 }
410
411 pub fn enable_request_intercept(&mut self) {
413 self.protocol_request_interception_enabled = true;
414 }
415
416 pub fn disable_request_intercept(&mut self) {
418 self.protocol_request_interception_enabled = false;
419 }
420
/// Sets (or clears, with `None`) the site key used for remote cache lookups.
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    let Self {
        cache_site_key: slot,
        ..
    } = self;
    *slot = cache_site_key;
}
426
/// Sets (or clears, with `None`) the policy that decides whether a cached response may
/// be served.
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    let Self {
        cache_policy: slot, ..
    } = self;
    *slot = cache_policy;
}
432
433 pub fn update_protocol_cache_disabled(&mut self) {
434 self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
435 }
436
437 pub fn authenticate(&mut self, credentials: Credentials) {
438 self.credentials = Some(credentials);
439 self.update_protocol_request_interception();
440 self.protocol_request_interception_enabled = true;
441 }
442
443 fn update_protocol_request_interception(&mut self) {
444 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
445
446 if enabled == self.protocol_request_interception_enabled {
447 return;
448 }
449
450 if enabled {
451 self.push_cdp_request(ENABLE_FETCH.clone())
452 } else {
453 self.push_cdp_request(DisableParams::default())
454 }
455 }
456
457 #[inline]
458 fn rel_for_ignore_script<'a>(&self, url: &'a str) -> Cow<'a, str> {
459 if url.starts_with('/') {
460 return Cow::Borrowed(url);
461 }
462
463 let base_raw = self.document_target_domain.as_str();
464
465 if base_raw.is_empty() {
466 return Cow::Borrowed(url);
467 }
468
469 let base = base_domain_from_any(base_raw).trim_end_matches('.');
470 if base.is_empty() {
471 return Cow::Borrowed(url);
472 }
473
474 let brand = first_label(base);
475
476 if let Some((host, rest)) = host_and_rest(url) {
477 if host_is_subdomain_of(host, base) || host_contains_label_icase(host, brand) {
478 return if rest.starts_with('/') {
479 Cow::Borrowed(rest)
480 } else {
481 Cow::Borrowed("/")
482 };
483 }
484 }
485
486 Cow::Borrowed(url)
487 }
488
489 #[inline]
491 pub(crate) fn ignore_script(
492 &self,
493 url: &str,
494 block_analytics: bool,
495 intercept_manager: NetworkInterceptManager,
496 ) -> bool {
497 let mut ignore_script = !url.starts_with("/");
499
500 if !ignore_script
501 && block_analytics
502 && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
503 {
504 ignore_script = true;
505 }
506
507 if !ignore_script {
508 if let Some(index) = url.find("//") {
509 let pos = index + 2;
510
511 if pos < url.len() {
513 if let Some(slash_index) = url[pos..].find('/') {
515 let base_path_index = pos + slash_index + 1;
516
517 if url.len() > base_path_index {
518 let new_url: &str = &url[base_path_index..];
519
520 if !ignore_script
522 && intercept_manager == NetworkInterceptManager::Unknown
523 {
524 let hydration_file =
525 JS_FRAMEWORK_PATH.iter().any(|p| new_url.starts_with(p));
526
527 if hydration_file && new_url.ends_with(".js") {
529 ignore_script = true;
530 }
531 }
532
533 if !ignore_script
534 && URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(new_url)
535 {
536 ignore_script = true;
537 }
538
539 if !ignore_script
540 && self.ignore_visuals
541 && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(new_url)
542 {
543 ignore_script = true;
544 }
545 }
546 }
547 }
548 }
549 }
550
551 if !ignore_script && block_analytics {
553 ignore_script = URL_IGNORE_TRIE_PATHS.contains_prefix(url);
554 }
555
556 ignore_script
557 }
558
559 #[inline]
561 fn skip_xhr(
562 &self,
563 skip_networking: bool,
564 event: &EventRequestPaused,
565 network_event: bool,
566 ) -> bool {
567 if !skip_networking && network_event {
569 let request_url = event.request.url.as_str();
570
571 let skip_analytics =
573 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
574
575 if skip_analytics {
576 true
577 } else if self.block_stylesheets || self.ignore_visuals {
578 let block_css = self.block_stylesheets;
579 let block_media = self.ignore_visuals;
580
581 let mut block_request = false;
582
583 if let Some(position) = request_url.rfind('.') {
584 let hlen = request_url.len();
585 let has_asset = hlen - position;
586
587 if has_asset >= 3 {
588 let next_position = position + 1;
589
590 if block_media
591 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
592 &request_url[next_position..].into(),
593 )
594 {
595 block_request = true;
596 } else if block_css {
597 block_request =
598 CaseInsensitiveString::from(request_url[next_position..].as_bytes())
599 .contains(&**CSS_EXTENSION)
600 }
601 }
602 }
603
604 if !block_request {
605 block_request = ignore_script_xhr_media(request_url);
606 }
607
608 block_request
609 } else {
610 skip_networking
611 }
612 } else {
613 skip_networking
614 }
615 }
616
/// With the `adblock` feature: keeps an existing skip decision, otherwise asks
/// `detect_ad` whether the request matches an ad rule.
#[cfg(feature = "adblock")]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    // Short-circuits so the ad engine is not consulted for already-skipped requests.
    skip_networking || self.detect_ad(event)
}
627
628 #[cfg(not(feature = "adblock"))]
630 #[inline]
631 fn detect_ad_if_enabled(&mut self, _event: &EventRequestPaused, skip_networking: bool) -> bool {
632 skip_networking
633 }
634
635 #[inline]
636 fn fail_request_blocked(
638 &mut self,
639 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
640 ) {
641 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
642 request_id.clone(),
643 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
644 );
645 self.push_cdp_request(params);
646 }
647
648 #[inline]
649 fn fulfill_request_empty_200(
651 &mut self,
652 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
653 ) {
654 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
655 request_id.clone(),
656 200,
657 );
658 self.push_cdp_request(params);
659 }
660
/// Queues a `FulfillRequestParams` command that answers the paused request from cached
/// data.
///
/// `body` is sent base64-encoded (standard alphabet), `headers` are converted to
/// `HeaderEntry` values, and `status` is used as the response code.
#[cfg(feature = "_cache")]
#[inline]
fn fulfill_request_from_cache(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    body: &[u8],
    headers: &std::collections::HashMap<String, String>,
    status: i64,
) {
    use crate::cdp::browser_protocol::fetch::HeaderEntry;
    use crate::handler::network::fetch::FulfillRequestParams;
    use base64::Engine;

    let resp_headers: Vec<HeaderEntry> = headers
        .iter()
        .map(|(name, value)| HeaderEntry {
            name: name.clone().into(),
            value: value.clone().into(),
        })
        .collect();

    let encoded_body = base64::engine::general_purpose::STANDARD.encode(body);

    let mut params = FulfillRequestParams::new(request_id.clone(), status);
    params.response_headers = Some(resp_headers);
    params.body = Some(encoded_body.into());

    self.push_cdp_request(params);
}
699
700 #[inline]
701 fn continue_request_with_url(
703 &mut self,
704 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
705 url: Option<&str>,
706 intercept_response: bool,
707 ) {
708 let mut params = ContinueRequestParams::new(request_id.clone());
709 if let Some(url) = url {
710 params.url = Some(url.to_string());
711 params.intercept_response = Some(intercept_response);
712 }
713 self.push_cdp_request(params);
714 }
715
716 #[inline]
718 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
719 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
720 return;
721 }
722
723 let resource_type = &event.resource_type;
724
725 if self.block_all {
726 tracing::debug!(
727 "Blocked (block_all): {:?} - {}",
728 event.resource_type,
729 event.request.url
730 );
731 return self.fail_request_blocked(&event.request_id);
732 }
733
734 if let Some(network_id) = event.network_id.as_ref() {
740 if let Some(request_will_be_sent) =
741 self.requests_will_be_sent.remove(network_id.as_ref())
742 {
743 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
744 } else {
745 self.request_id_to_interception_id
746 .insert(network_id.clone(), event.request_id.clone().into());
747 }
748 }
749
750 let javascript_resource = *resource_type == ResourceType::Script;
752 let document_resource = *resource_type == ResourceType::Document;
753 let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
754
755 let mut skip_networking =
757 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
758
759 if !skip_networking {
761 skip_networking = self.document_reload_tracker >= 3;
762 }
763
764 let (current_url_cow, had_replacer) =
766 self.handle_document_replacement_and_tracking(event, document_resource);
767
768 let current_url: &str = current_url_cow.as_ref();
769
770 if !skip_networking {
772 if self.xml_document && current_url.ends_with(".xsl") {
774 skip_networking = false;
775 } else {
776 skip_networking = self.should_skip_for_visuals_and_basic_js(
777 resource_type,
778 javascript_resource,
779 current_url,
780 );
781 }
782 }
783
784 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
786
787 if !skip_networking
789 && (self.only_html || self.ignore_visuals)
790 && (javascript_resource || document_resource)
791 {
792 skip_networking = ignore_script_embedded(current_url);
793 }
794
795 if skip_networking && javascript_resource {
797 let rel = self.rel_for_ignore_script(current_url);
798 skip_networking =
799 self.ignore_script(rel.as_ref(), self.block_analytics, self.intercept_manager);
800 }
801
802 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
804
805 if !skip_networking && (javascript_resource || network_resource || document_resource) {
807 skip_networking = self.intercept_manager.intercept_detection(
808 current_url,
809 self.ignore_visuals,
810 network_resource,
811 );
812 }
813
814 if !skip_networking && (javascript_resource || network_resource) {
816 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
817 }
818
819 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
821 {
822 skip_networking = false;
823 }
824
825 if skip_networking && self.is_whitelisted(current_url) {
827 skip_networking = false;
828 }
829
830 if skip_networking {
831 tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
832 self.fulfill_request_empty_200(&event.request_id);
833 } else {
834 #[cfg(feature = "_cache")]
835 {
836 if let (Some(policy), Some(cache_site_key)) =
837 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
838 {
839 let current_url = format!("{}:{}", event.request.method, ¤t_url);
840
841 if let Some((res, cache_policy)) =
842 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
843 {
844 if policy.allows_cached(&cache_policy) {
845 tracing::debug!(
846 "Remote Cached: {:?} - {}",
847 resource_type,
848 ¤t_url
849 );
850 return self.fulfill_request_from_cache(
851 &event.request_id,
852 &res.body,
853 &res.headers,
854 res.status as i64,
855 );
856 }
857 }
858 }
859 }
860
861 tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
863 self.continue_request_with_url(
864 &event.request_id,
865 if had_replacer {
866 Some(current_url)
867 } else {
868 None
869 },
870 !had_replacer,
871 );
872 }
873 }
874
875 pub fn has_target_domain(&self) -> bool {
877 !self.document_target_url.is_empty()
878 }
879
880 pub fn set_page_url(&mut self, page_target_url: String) {
882 let host_base = host_and_rest(&page_target_url)
883 .map(|(h, _)| base_domain_from_host(h))
884 .unwrap_or("");
885
886 self.document_target_domain = host_base.to_string();
887 self.document_target_url = page_target_url;
888 }
889
890 pub fn clear_target_domain(&mut self) {
892 self.document_reload_tracker = 0;
893 self.document_target_url = Default::default();
894 self.document_target_domain = Default::default();
895 }
896 #[inline]
904 fn handle_document_replacement_and_tracking<'a>(
905 &mut self,
906 event: &'a EventRequestPaused,
907 document_resource: bool,
908 ) -> (Cow<'a, str>, bool) {
909 let mut replacer: Option<String> = None;
910 let current_url = event.request.url.as_str();
911
912 if document_resource {
913 if self.document_target_url == current_url {
914 self.document_reload_tracker += 1;
915 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
916 {
917 let (http_document_replacement, mut https_document_replacement) =
918 if self.document_target_url.starts_with("http://") {
919 (
920 self.document_target_url.replacen("http://", "http//", 1),
921 self.document_target_url.replacen("http://", "https://", 1),
922 )
923 } else {
924 (
925 self.document_target_url.replacen("https://", "https//", 1),
926 self.document_target_url.replacen("https://", "http://", 1),
927 )
928 };
929
930 let trailing = https_document_replacement.ends_with('/');
932 if trailing {
933 https_document_replacement.pop();
934 }
935 if https_document_replacement.ends_with('/') {
936 https_document_replacement.pop();
937 }
938
939 let redirect_mask = format!(
940 "{}{}",
941 https_document_replacement, http_document_replacement
942 );
943
944 if current_url == redirect_mask {
945 replacer = Some(if trailing {
946 format!("{}/", https_document_replacement)
947 } else {
948 https_document_replacement
949 });
950 }
951 }
952
953 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
954 self.xml_document = true;
955 }
956
957 self.document_target_url = event.request.url.clone();
959 self.document_target_domain = host_and_rest(&self.document_target_url)
960 .map(|(h, _)| base_domain_from_host(h).to_string())
961 .unwrap_or_default();
962 }
963
964 let current_url_cow = match replacer {
965 Some(r) => Cow::Owned(r),
966 None => Cow::Borrowed(event.request.url.as_str()),
967 };
968
969 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
970 (current_url_cow, had_replacer)
971 }
972
973 #[inline]
975 fn should_skip_for_visuals_and_basic_js(
976 &self,
977 resource_type: &ResourceType,
978 javascript_resource: bool,
979 current_url: &str,
980 ) -> bool {
981 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
982 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
983 || (self.block_javascript
984 && javascript_resource
985 && self.intercept_manager == NetworkInterceptManager::Unknown
986 && !ALLOWED_MATCHER.is_match(current_url))
987 }
988
/// Checks the paused request against the bundled adblock rules.
///
/// Only image, media, stylesheet, document, fetch and XHR requests are evaluated; any
/// other resource type returns `false` without touching the engine. The engine is built
/// lazily on first use from `spider_network_blocker::adblock::ADBLOCK_PATTERNS`.
// NOTE(review): both hostname arguments are the fixed placeholder "example.com" rather
// than values taken from the request — confirm this is intended.
#[cfg(feature = "adblock")]
pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
    use adblock::{
        lists::{FilterSet, ParseOptions, RuleTypes},
        Engine,
    };

    lazy_static::lazy_static! {
        static ref AD_ENGINE: Engine = {
            let mut rules = ParseOptions::default();
            rules.rule_types = RuleTypes::All;

            let mut filter_set = FilterSet::new(false);
            filter_set.add_filters(
                &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
                rules,
            );

            Engine::from_filter_set(filter_set, true)
        };
    }

    let blockable = matches!(
        event.resource_type,
        ResourceType::Image
            | ResourceType::Media
            | ResourceType::Stylesheet
            | ResourceType::Document
            | ResourceType::Fetch
            | ResourceType::Xhr
    );

    if !blockable {
        return false;
    }

    let request_type = event.resource_type.as_ref().to_lowercase();
    let third_party = !event.request.is_same_site.unwrap_or_default();

    let request = adblock::request::Request::preparsed(
        &event.request.url,
        "example.com",
        "example.com",
        &request_type,
        third_party,
    );

    AD_ENGINE.check_network_request(&request).matched
}
1036
1037 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1038 let response = if self
1039 .attempted_authentications
1040 .contains(event.request_id.as_ref())
1041 {
1042 AuthChallengeResponseResponse::CancelAuth
1043 } else if self.credentials.is_some() {
1044 self.attempted_authentications
1045 .insert(event.request_id.clone().into());
1046 AuthChallengeResponseResponse::ProvideCredentials
1047 } else {
1048 AuthChallengeResponseResponse::Default
1049 };
1050
1051 let mut auth = AuthChallengeResponse::new(response);
1052 if let Some(creds) = self.credentials.clone() {
1053 auth.username = Some(creds.username);
1054 auth.password = Some(creds.password);
1055 }
1056 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1057 }
1058
1059 pub fn set_offline_mode(&mut self, value: bool) {
1061 if self.offline == value {
1062 return;
1063 }
1064 self.offline = value;
1065 if let Ok(network) = EmulateNetworkConditionsParams::builder()
1066 .offline(self.offline)
1067 .latency(0)
1068 .download_throughput(-1.)
1069 .upload_throughput(-1.)
1070 .build()
1071 {
1072 self.push_cdp_request(network);
1073 }
1074 }
1075
1076 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1078 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1079 if let Some(interception_id) = self
1080 .request_id_to_interception_id
1081 .remove(event.request_id.as_ref())
1082 {
1083 self.on_request(event, Some(interception_id));
1084 } else {
1085 self.requests_will_be_sent
1087 .insert(event.request_id.clone(), event.clone());
1088 }
1089 } else {
1090 self.on_request(event, None);
1091 }
1092 }
1093
1094 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1096 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1097 request.from_memory_cache = true;
1098 }
1099 }
1100
1101 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1103 let mut request_failed = false;
1104
1105 let mut deducted: u64 = 0;
1107
1108 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1109 let before = *max_bytes;
1110
1111 let received_bytes: u64 = event.response.encoded_data_length as u64;
1113
1114 let content_length: Option<u64> = event
1116 .response
1117 .headers
1118 .inner()
1119 .get("content-length")
1120 .and_then(|v| v.as_str())
1121 .and_then(|s| s.trim().parse::<u64>().ok());
1122
1123 *max_bytes = max_bytes.saturating_sub(received_bytes);
1125
1126 if let Some(cl) = content_length {
1128 if cl > *max_bytes {
1129 *max_bytes = 0;
1130 }
1131 }
1132
1133 request_failed = *max_bytes == 0;
1134
1135 deducted = before.saturating_sub(*max_bytes);
1137 }
1138
1139 if deducted > 0 {
1141 self.queued_events
1142 .push_back(NetworkEvent::BytesConsumed(deducted));
1143 }
1144
1145 if request_failed && self.max_bytes_allowed.is_some() {
1147 self.set_block_all(true);
1148 }
1149
1150 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1151 request.set_response(event.response.clone());
1152 self.queued_events.push_back(if request_failed {
1153 NetworkEvent::RequestFailed(request)
1154 } else {
1155 NetworkEvent::RequestFinished(request)
1156 });
1157 }
1158 }
1159
1160 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1162 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1163 if let Some(interception_id) = request.interception_id.as_ref() {
1164 self.attempted_authentications
1165 .remove(interception_id.as_ref());
1166 }
1167 self.queued_events
1168 .push_back(NetworkEvent::RequestFinished(request));
1169 }
1170 }
1171
1172 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1174 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1175 request.failure_text = Some(event.error_text.clone());
1176 if let Some(interception_id) = request.interception_id.as_ref() {
1177 self.attempted_authentications
1178 .remove(interception_id.as_ref());
1179 }
1180 self.queued_events
1181 .push_back(NetworkEvent::RequestFailed(request));
1182 }
1183 }
1184
1185 fn on_request(
1187 &mut self,
1188 event: &EventRequestWillBeSent,
1189 interception_id: Option<InterceptionId>,
1190 ) {
1191 let mut redirect_chain = Vec::new();
1192 let mut redirect_location = None;
1193
1194 if let Some(redirect_resp) = &event.redirect_response {
1195 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1196 if is_redirect_status(redirect_resp.status) {
1197 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1198 if redirect_resp.url != location {
1199 let fixed_location = location.replace(&redirect_resp.url, "");
1200
1201 if !fixed_location.is_empty() {
1202 request.response.as_mut().map(|resp| {
1203 resp.headers.0["Location"] =
1204 serde_json::Value::String(fixed_location.clone());
1205 });
1206 }
1207
1208 redirect_location = Some(fixed_location);
1209 }
1210 }
1211 }
1212
1213 self.handle_request_redirect(
1214 &mut request,
1215 if let Some(redirect_location) = redirect_location {
1216 let mut redirect_resp = redirect_resp.clone();
1217
1218 if !redirect_location.is_empty() {
1219 redirect_resp.headers.0["Location"] =
1220 serde_json::Value::String(redirect_location);
1221 }
1222
1223 redirect_resp
1224 } else {
1225 redirect_resp.clone()
1226 },
1227 );
1228
1229 redirect_chain = std::mem::take(&mut request.redirect_chain);
1230 redirect_chain.push(request);
1231 }
1232 }
1233
1234 let request = HttpRequest::new(
1235 event.request_id.clone(),
1236 event.frame_id.clone(),
1237 interception_id,
1238 self.user_request_interception_enabled,
1239 redirect_chain,
1240 );
1241
1242 self.requests.insert(event.request_id.clone(), request);
1243 self.queued_events
1244 .push_back(NetworkEvent::Request(event.request_id.clone()));
1245 }
1246
1247 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1249 request.set_response(response);
1250 if let Some(interception_id) = request.interception_id.as_ref() {
1251 self.attempted_authentications
1252 .remove(interception_id.as_ref());
1253 }
1254 }
1255}
1256
/// Events produced by [`NetworkManager`] and retrieved through `NetworkManager::poll`.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP command to send: method identifier plus serialized parameters.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A new request with this id started being tracked.
    Request(RequestId),
    /// Response notification for a request id. Not emitted by the code in this part of
    /// the file.
    Response(RequestId),
    /// The request failed (loading error, or the byte budget was exhausted).
    RequestFailed(HttpRequest),
    /// The request completed.
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the byte budget by a received response.
    BytesConsumed(u64),
}
1272
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;

    /// The third-party allow-list must let challenge, payment and captcha scripts
    /// through while leaving analytics beacons blocked.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Cloudflare challenge script served from an arbitrary site.
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        let challenge_allowed = ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge);
        assert!(
            challenge_allowed,
            "expected Cloudflare challenge script to be allowed"
        );

        // Analytics beacon that is not on the allow-list.
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        let insights_allowed = ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights);
        assert!(
            !insights_allowed,
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        let always_allowed = [
            "https://js.stripe.com/v3/",
            "https://www.google.com/recaptcha/api.js?render=explicit",
            "https://code.jquery.com/jquery-3.7.1.min.js",
        ];
        for &url in always_allowed.iter() {
            assert!(ALLOWED_MATCHER_3RD_PARTY.is_match(url));
        }
    }
}