#[cfg(any(feature = "adblock", feature = "firewall"))]
use super::blockers::block_websites::block_ads;
use super::blockers::{
block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
xhr::IGNORE_XHR_ASSETS,
};
use crate::auth::Credentials;
#[cfg(feature = "_cache")]
use crate::cache::BasicCachePolicy;
use crate::cmd::CommandChain;
use crate::handler::http::HttpRequest;
use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
use aho_corasick::AhoCorasick;
use case_insensitive_string::CaseInsensitiveString;
use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
use chromiumoxide_cdp::cdp::browser_protocol::network::{
EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
InterceptionId, NetworkConditions, RequestId, ResourceType, Response, SetCacheDisabledParams,
SetExtraHttpHeadersParams,
};
use chromiumoxide_cdp::cdp::browser_protocol::{
fetch::{
self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
},
network::SetBypassServiceWorkerParams,
};
use chromiumoxide_cdp::cdp::browser_protocol::{
network::EnableParams, security::SetIgnoreCertificateErrorsParams,
};
use chromiumoxide_types::{Command, Method, MethodId};
use hashbrown::{HashMap, HashSet};
use lazy_static::lazy_static;
use reqwest::header::PROXY_AUTHORIZATION;
use spider_network_blocker::intercept_manager::NetworkInterceptManager;
pub use spider_network_blocker::scripts::{
URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
};
use std::borrow::Cow;
use std::collections::VecDeque;
use std::time::Duration;
// Lazily-initialised tables and pre-built CDP commands used by the network manager.
lazy_static! {
/// URL substrings that are compiled into [`ALLOWED_MATCHER`].
static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
"jquery", "angular",
"react", "vue", "bootstrap",
"d3",
"lodash",
"ajax",
"application",
"app", "main",
"index",
"bundle",
"vendor",
"runtime",
"polyfill",
"scripts",
"es2015.",
"es2020.",
"webpack",
"captcha",
"client",
"/cdn-cgi/challenge-platform/",
"/wp-content/js/", "https://m.stripe.network/",
"https://challenges.cloudflare.com/",
"https://www.google.com/recaptcha/",
"https://google.com/recaptcha/api.js",
"https://www.gstatic.com/recaptcha/",
"https://captcha.px-cloud.net/",
"https://geo.captcha-delivery.com/",
"https://api.leminnow.com/captcha/",
"https://cdn.auth0.com/js/lock/",
"https://captcha.gtimg.com",
"https://client-api.arkoselabs.com/",
"https://www.capy.me/puzzle/",
"https://newassets.hcaptcha.com/",
"https://cdn.auth0.com/client",
"https://js.stripe.com/",
"https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
];
/// Substring matcher over [`JS_FRAMEWORK_ALLOW`]; building it panics if the automaton cannot be built.
pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
/// URL substrings that are compiled into [`ALLOWED_MATCHER_3RD_PARTY`].
static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
"https://m.stripe.network/",
"https://challenges.cloudflare.com/",
"https://js.stripe.com/",
"https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
"https://ct.captcha-delivery.com/",
"https://geo.captcha-delivery.com/",
"https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://cdn.auth0.com/client",
"https://captcha.px-cloud.net/",
"https://www.capy.me/puzzle/",
"https://www.gstatic.com/recaptcha/",
"https://google.com/recaptcha/",
"https://www.google.com/recaptcha/",
"https://www.recaptcha.net/recaptcha/",
"https://js.hcaptcha.com/1/api.js",
"https://hcaptcha.com/1/api.js",
"https://js.datadome.co/tags.js",
"https://api-js.datadome.co/",
"https://client.perimeterx.net/",
"https://captcha.px-cdn.net/",
"https://newassets.hcaptcha.com/",
"https://captcha.px-cloud.net/",
"https://s.perimeterx.net/",
"https://api.leminnow.com/captcha/",
"https://client-api.arkoselabs.com/",
"https://static.geetest.com/v4/gt4.js",
"https://static.geetest.com/",
"https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
"https://cdn.perfdrive.com/aperture/",
"https://assets.queue-it.net/",
"discourse-cdn.com/",
"hcaptcha.com",
"/cdn-cgi/challenge-platform/",
"/_Incapsula_Resource"
];
/// Substring matcher over [`JS_FRAMEWORK_ALLOW_3RD_PARTY`]. `on_fetch_request_paused` uses it to
/// un-block script requests that an earlier heuristic marked for blocking.
pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
/// Set of path fragments; not referenced elsewhere in this file.
/// NOTE(review): presumably framework asset paths that should be allowed - confirm with callers.
pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
phf::phf_set! {
"_astro/", "_app/immutable"
}
};
/// Set of MIME type strings; not referenced elsewhere in this file.
/// NOTE(review): presumably content types whose bodies should be ignored - confirm with callers.
pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
"application/pdf",
"application/zip",
"application/x-rar-compressed",
"application/x-tar",
"image/png",
"image/jpeg",
"image/gif",
"image/bmp",
"image/webp",
"image/svg+xml",
"video/mp4",
"video/x-msvideo",
"video/x-matroska",
"video/webm",
"audio/mpeg",
"audio/ogg",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/x-7z-compressed",
"application/x-rpm",
"application/x-shockwave-flash",
"application/rtf",
};
/// Resource type names blocked by `should_skip_for_visuals_and_basic` when `ignore_visuals` is set.
pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
"Image",
"Media",
"Font"
};
/// Resource type names that `on_fetch_request_paused` always marks for blocking.
pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
"CspViolationReport",
"Ping",
};
/// The string `css`, used by `skip_xhr` when inspecting the text after the last `.` of a URL.
pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
/// Init command list holding the serialized default network `EnableParams`
/// (empty if serialization fails).
pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
let enable = EnableParams::default();
if let Ok(c) = serde_json::to_value(&enable) {
vec![(enable.identifier(), c)]
} else {
vec![]
}
};
/// Same as [`INIT_CHAIN`] followed by `SetIgnoreCertificateErrorsParams::new(true)`.
/// A command that fails to serialize is left out.
pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
let enable = EnableParams::default();
let mut v = vec![];
if let Ok(c) = serde_json::to_value(&enable) {
v.push((enable.identifier(), c));
}
let ignore = SetIgnoreCertificateErrorsParams::new(true);
if let Ok(ignored) = serde_json::to_value(&ignore) {
v.push((ignore.identifier(), ignored));
}
v
};
/// Fetch-domain enable command: auth requests are handled and every URL (`*`) is paused at the
/// request stage.
pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
fetch::EnableParams::builder()
.handle_auth_requests(true)
.pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
.build()
};
}
/// Returns `true` when `status` is one of the redirect codes 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
/// Tracks in-flight requests for a page and decides, for every paused request, whether it is
/// blocked, served from cache or continued. Resulting work is exposed as [`NetworkEvent`]s
/// through `poll`.
#[derive(Debug)]
pub struct NetworkManager {
// Pending events, handed out front-first by `poll`.
queued_events: VecDeque<NetworkEvent>,
// Selects `INIT_CHAIN_IGNORE_HTTP_ERRORS` instead of `INIT_CHAIN` in `init_commands`.
ignore_httpserrors: bool,
// Requests that have been announced but not yet finished or failed.
requests: HashMap<RequestId, HttpRequest>,
// `requestWillBeSent` events still waiting for their paused-request counterpart.
requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
// Extra headers last supplied through `set_extra_headers`.
extra_headers: std::collections::HashMap<String, String>,
// Interception ids of paused requests still waiting for their `requestWillBeSent` event.
request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
// Inverse of the value last passed to `set_cache_enabled`.
user_cache_disabled: bool,
// Requests for which credentials were already supplied once.
attempted_authentications: HashSet<RequestId>,
// Credentials used to answer auth challenges, if any.
credentials: Option<Credentials>,
/// Whether the user asked for request interception via `set_request_interception`.
pub(crate) user_request_interception_enabled: bool,
// When set, every paused request is failed as blocked.
block_all: bool,
/// Whether protocol-level (Fetch) interception is considered active.
pub(crate) protocol_request_interception_enabled: bool,
// Last value applied through `set_offline_mode`.
offline: bool,
/// Timeout handed to the command chain built by `init_commands`.
pub request_timeout: Duration,
/// Enables blocking of `Image`, `Media` and `Font` resources and related heuristics.
pub ignore_visuals: bool,
/// Enables blocking of stylesheet resources.
pub block_stylesheets: bool,
/// Together with `only_html` or `ignore_visuals`, enables the embedded-script check.
pub block_javascript: bool,
/// Enables the analytics-related URL checks.
pub block_analytics: bool,
/// Consulted for `Prefetch` resources in `on_fetch_request_paused`.
pub block_prefetch: bool,
/// Together with `block_javascript`, enables the embedded-script check.
pub only_html: bool,
/// Set when the first document request of the page ends with `.xml`.
pub xml_document: bool,
/// Site-specific interception rules consulted for script, data and document requests.
pub intercept_manager: NetworkInterceptManager,
/// Counts repeated requests for the current document URL; 3 or more marks requests for blocking.
pub document_reload_tracker: u8,
/// URL of the most recent document request (or the value given to `set_page_url`).
pub document_target_url: String,
/// Base domain derived from `document_target_url`.
pub document_target_domain: String,
/// Remaining byte budget; reaching zero switches the manager to `block_all`.
pub max_bytes_allowed: Option<u64>,
/// Key used to look up items in the remote session cache.
#[cfg(feature = "_cache")]
pub cache_site_key: Option<String>,
/// Policy deciding whether a cached item may be served.
#[cfg(feature = "_cache")]
pub cache_policy: Option<BasicCachePolicy>,
// User-supplied substrings that un-block a request, and the matcher built from them.
whitelist_patterns: Vec<String>,
whitelist_matcher: Option<AhoCorasick>,
// User-supplied substrings that block a request, and the matcher built from them.
blacklist_patterns: Vec<String>,
blacklist_matcher: Option<AhoCorasick>,
// When true the blacklist is applied last and cannot be overridden by the whitelist.
blacklist_strict: bool,
}
impl NetworkManager {
pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
Self {
queued_events: Default::default(),
ignore_httpserrors,
requests: Default::default(),
requests_will_be_sent: Default::default(),
extra_headers: Default::default(),
request_id_to_interception_id: Default::default(),
user_cache_disabled: false,
attempted_authentications: Default::default(),
credentials: None,
block_all: false,
user_request_interception_enabled: false,
protocol_request_interception_enabled: false,
offline: false,
request_timeout,
ignore_visuals: false,
block_javascript: false,
block_stylesheets: false,
block_prefetch: true,
block_analytics: true,
only_html: false,
xml_document: false,
intercept_manager: NetworkInterceptManager::Unknown,
document_reload_tracker: 0,
document_target_url: String::new(),
document_target_domain: String::new(),
whitelist_patterns: Vec::new(),
whitelist_matcher: None,
blacklist_patterns: Vec::new(),
blacklist_matcher: None,
blacklist_strict: true,
max_bytes_allowed: None,
#[cfg(feature = "_cache")]
cache_site_key: None,
#[cfg(feature = "_cache")]
cache_policy: None,
}
}
/// Replaces the whole whitelist with `patterns` and rebuilds the whitelist matcher.
pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
where
    I: IntoIterator<Item = S>,
    S: Into<String>,
{
    let replacement: Vec<String> = patterns.into_iter().map(|pattern| pattern.into()).collect();
    self.whitelist_patterns = replacement;
    self.rebuild_whitelist_matcher();
}
/// Replaces the whole blacklist with `patterns` and rebuilds the blacklist matcher.
pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
where
    I: IntoIterator<Item = S>,
    S: Into<String>,
{
    let replacement: Vec<String> = patterns.into_iter().map(|pattern| pattern.into()).collect();
    self.blacklist_patterns = replacement;
    self.rebuild_blacklist_matcher();
}
/// Appends one pattern to the blacklist and rebuilds the blacklist matcher.
pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
    let owned: String = pattern.into();
    self.blacklist_patterns.push(owned);
    self.rebuild_blacklist_matcher();
}
/// Appends every item of `patterns` to the blacklist, then rebuilds the blacklist matcher once.
pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
where
    I: IntoIterator<Item = S>,
    S: Into<String>,
{
    for pattern in patterns {
        self.blacklist_patterns.push(pattern.into());
    }
    self.rebuild_blacklist_matcher();
}
/// Removes every blacklist pattern and drops the blacklist matcher.
pub fn clear_blacklist(&mut self) {
    self.blacklist_matcher = None;
    self.blacklist_patterns.clear();
}
/// Sets whether the blacklist is strict.
///
/// In strict mode `on_fetch_request_paused` applies the blacklist last, so a whitelist or
/// allow-list match cannot un-block a blacklisted URL. In non-strict mode the blacklist is
/// applied early and later allow rules may override it.
pub fn set_blacklist_strict(&mut self, strict: bool) {
    self.blacklist_strict = strict;
}
/// Recompiles the blacklist matcher from the current patterns.
///
/// The matcher becomes `None` when there are no patterns or when the automaton fails to build.
#[inline]
fn rebuild_blacklist_matcher(&mut self) {
    self.blacklist_matcher = match self.blacklist_patterns.as_slice() {
        [] => None,
        patterns => AhoCorasick::new(patterns.iter().map(|s| s.as_str())).ok(),
    };
}
/// Returns `true` when a blacklist matcher exists and any blacklist pattern occurs in `url`.
#[inline]
fn is_blacklisted(&self, url: &str) -> bool {
    match self.blacklist_matcher.as_ref() {
        Some(matcher) => matcher.is_match(url),
        None => false,
    }
}
/// Appends one pattern to the whitelist and rebuilds the whitelist matcher.
pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
    let owned: String = pattern.into();
    self.whitelist_patterns.push(owned);
    self.rebuild_whitelist_matcher();
}
/// Appends every item of `patterns` to the whitelist, then rebuilds the whitelist matcher once.
pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
where
    I: IntoIterator<Item = S>,
    S: Into<String>,
{
    for pattern in patterns {
        self.whitelist_patterns.push(pattern.into());
    }
    self.rebuild_whitelist_matcher();
}
/// Recompiles the whitelist matcher from the current patterns.
///
/// The matcher becomes `None` when there are no patterns or when the automaton fails to build.
#[inline]
fn rebuild_whitelist_matcher(&mut self) {
    self.whitelist_matcher = match self.whitelist_patterns.as_slice() {
        [] => None,
        patterns => AhoCorasick::new(patterns.iter().map(|s| s.as_str())).ok(),
    };
}
/// Returns `true` when a whitelist matcher exists and any whitelist pattern occurs in `url`.
#[inline]
fn is_whitelisted(&self, url: &str) -> bool {
    match self.whitelist_matcher.as_ref() {
        Some(matcher) => matcher.is_match(url),
        None => false,
    }
}
/// Builds the command chain to run at start-up, using `request_timeout` as its timeout.
///
/// When `ignore_httpserrors` is set the chain also contains the command that ignores
/// certificate errors.
pub fn init_commands(&self) -> CommandChain {
    let template = match self.ignore_httpserrors {
        true => &*INIT_CHAIN_IGNORE_HTTP_ERRORS,
        false => &*INIT_CHAIN,
    };
    CommandChain::new(template.clone(), self.request_timeout)
}
/// Serializes `cmd` and queues it as a [`NetworkEvent::SendCdpRequest`].
///
/// A command that fails to serialize is dropped silently.
pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
    // The identifier must be read before `cmd` is consumed by serialization.
    let method = cmd.identifier();
    match serde_json::to_value(cmd) {
        Ok(params) => self
            .queued_events
            .push_back(NetworkEvent::SendCdpRequest((method, params))),
        Err(_) => {}
    }
}
/// Removes and returns the oldest queued event, or `None` when the queue is empty.
pub fn poll(&mut self) -> Option<NetworkEvent> {
    let next_event = self.queued_events.pop_front();
    next_event
}
/// Returns the extra headers currently stored on the manager.
pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
    let stored = &self.extra_headers;
    stored
}
/// Stores `headers` as the extra headers and queues a command that applies them.
///
/// Any `Proxy-Authorization` header is removed first, whatever its letter case, so it is never
/// forwarded with page requests. Header names are case-insensitive, so matching only the
/// lowercase and canonical spellings would let other spellings such as `PROXY-AUTHORIZATION`
/// slip through.
///
/// No command is queued when the remaining map is empty or cannot be serialized.
pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
    self.extra_headers = headers;
    self.extra_headers
        .retain(|name, _| !name.eq_ignore_ascii_case(PROXY_AUTHORIZATION.as_str()));

    if !self.extra_headers.is_empty() {
        if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
            self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
        }
    }
}
/// Queues a `SetBypassServiceWorkerParams` command carrying `bypass` unchanged.
///
/// NOTE(review): despite the method name, the argument is the *bypass* flag, so `true`
/// presumably turns service workers off for requests - confirm against the CDP documentation.
pub fn set_service_worker_enabled(&mut self, bypass: bool) {
    let command = SetBypassServiceWorkerParams::new(bypass);
    self.push_cdp_request(command);
}
/// Sets the flag that makes `on_fetch_request_paused` fail every paused request.
pub fn set_block_all(&mut self, block_all: bool) {
    self.block_all = block_all;
}
/// Records whether the user wants request interception and queues the matching Fetch
/// enable/disable command if the protocol state has to change.
pub fn set_request_interception(&mut self, enabled: bool) {
    self.user_request_interception_enabled = enabled;
    self.update_protocol_request_interception();
}
/// Enables or disables the cache.
///
/// The cache-disabled command is queued only when the requested state differs from the stored
/// one; the stored state is updated either way.
pub fn set_cache_enabled(&mut self, enabled: bool) {
    let disabled = !enabled;
    let changed = self.user_cache_disabled != disabled;
    self.user_cache_disabled = disabled;
    if changed {
        self.update_protocol_cache_disabled();
    }
}
/// Marks protocol-level request interception as enabled. No command is queued.
pub fn enable_request_intercept(&mut self) {
    self.protocol_request_interception_enabled = true;
}
/// Marks protocol-level request interception as disabled. No command is queued.
pub fn disable_request_intercept(&mut self) {
    self.protocol_request_interception_enabled = false;
}
/// Sets (or clears, with `None`) the key used for remote session cache look-ups.
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    self.cache_site_key = cache_site_key;
}
/// Sets (or clears, with `None`) the policy that decides whether a cached item may be served.
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    self.cache_policy = cache_policy;
}
/// Queues a `SetCacheDisabledParams` command reflecting the stored cache-disabled flag.
pub fn update_protocol_cache_disabled(&mut self) {
    let command = SetCacheDisabledParams::new(self.user_cache_disabled);
    self.push_cdp_request(command);
}
/// Stores `credentials` for answering auth challenges and turns interception on.
///
/// The Fetch enable command is queued only if protocol interception was not already marked as
/// enabled; afterwards the protocol interception flag is set unconditionally.
pub fn authenticate(&mut self, credentials: Credentials) {
    self.credentials = Some(credentials);
    // Must run before the flag is set, otherwise the state would look unchanged.
    self.update_protocol_request_interception();
    self.protocol_request_interception_enabled = true;
}
/// Queues the Fetch enable or disable command when the desired interception state differs from
/// the recorded protocol state.
///
/// Interception is desired when the user enabled it or credentials are present. This method
/// does not modify `protocol_request_interception_enabled` itself.
fn update_protocol_request_interception(&mut self) {
    let wanted = self.user_request_interception_enabled || self.credentials.is_some();
    match (wanted, self.protocol_request_interception_enabled) {
        (true, false) => self.push_cdp_request(ENABLE_FETCH.clone()),
        (false, true) => self.push_cdp_request(DisableParams::default()),
        _ => {}
    }
}
/// Returns `true` when a script URL matches one of the block lists.
///
/// Checks, in turn: the analytics URL trie on the full URL (only with `block_analytics`), the
/// blocked-website list, and then the URL path with query and fragment removed. The path is
/// tested with its leading slash, without it, and as its final segment against the analytics
/// path trie (only with `block_analytics`), against the script base-path list, and, with
/// `ignore_visuals`, against the script style-path list. URLs without a path only go through
/// the first two checks.
#[inline]
fn should_block_script_blocklist_only(&self, url: &str) -> bool {
    let analytics = self.block_analytics;

    if analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url) {
        return true;
    }
    if crate::handler::blockers::block_websites::block_website(url) {
        return true;
    }

    let path = match Self::url_path_with_leading_slash(url) {
        Some(raw) => Self::strip_query_fragment(raw),
        None => return false,
    };
    let relative = path.strip_prefix('/').unwrap_or(path);
    let file_name = path.rsplit('/').next().unwrap_or(path);

    if analytics
        && (URL_IGNORE_TRIE_PATHS.contains_prefix(path)
            || URL_IGNORE_TRIE_PATHS.contains_prefix(relative)
            || URL_IGNORE_TRIE_PATHS.contains_prefix(file_name))
    {
        return true;
    }

    URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(relative)
        || (self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(relative))
}
/// Returns the part of `url` starting at the first `/` that follows the first `//`.
///
/// The result still contains any query string or fragment. `None` is returned when `url` has
/// no `//` or no `/` after it.
#[inline]
fn url_path_with_leading_slash(url: &str) -> Option<&str> {
    let (_, after_scheme) = url.split_once("//")?;
    let host_len = after_scheme.find('/')?;
    Some(&after_scheme[host_len..])
}
/// Returns `s` cut off before the first `?` or `#`, whichever comes first.
///
/// `s` is returned unchanged when it contains neither character.
#[inline]
fn strip_query_fragment(s: &str) -> &str {
    match s.find(|c: char| c == '?' || c == '#') {
        Some(cut) => &s[..cut],
        None => s,
    }
}
/// Decides whether a data (XHR/fetch-like) request should be blocked.
///
/// When `skip_networking` is already `true`, or `network_event` is `false`, the incoming
/// `skip_networking` value is returned unchanged. Otherwise:
/// - with `block_analytics`, a URL flagged by `ignore_script_xhr` or `block_xhr` is blocked;
/// - with `block_stylesheets` or `ignore_visuals`, the text after the last `.` of the URL is
///   inspected, and `ignore_script_xhr_media` is consulted as a fallback;
/// - in all remaining cases the request is not blocked.
#[inline]
fn skip_xhr(
&self,
skip_networking: bool,
event: &EventRequestPaused,
network_event: bool,
) -> bool {
if !skip_networking && network_event {
let request_url = event.request.url.as_str();
let skip_analytics =
self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
if skip_analytics {
true
} else if self.block_stylesheets || self.ignore_visuals {
let block_css = self.block_stylesheets;
let block_media = self.ignore_visuals;
let mut block_request = false;
if let Some(position) = request_url.rfind('.') {
let hlen = request_url.len();
// Number of bytes from the last `.` (inclusive) to the end of the URL.
let has_asset = hlen - position;
// Requires at least two characters after the dot.
if has_asset >= 3 {
let next_position = position + 1;
// Visual assets: the text after the dot must be a member of IGNORE_XHR_ASSETS.
if block_media
&& IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
&request_url[next_position..].into(),
)
{
block_request = true;
} else if block_css {
// NOTE(review): presumably a case-insensitive substring test for `css` - confirm
// the semantics of `CaseInsensitiveString::contains`.
block_request = CaseInsensitiveString::from(
&request_url.as_bytes()[next_position..],
)
.contains(&**CSS_EXTENSION)
}
}
}
// Fall back to the media URL check when the extension test did not block.
if !block_request {
block_request = ignore_script_xhr_media(request_url);
}
block_request
} else {
skip_networking
}
} else {
skip_networking
}
}
/// Returns `true` when the request is already marked for blocking, when `block_ads` flags its
/// URL, or when the adblock engine in `detect_ad` matches it. The checks stop at the first hit.
#[cfg(feature = "adblock")]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    skip_networking || block_ads(&event.request.url) || self.detect_ad(event)
}
/// Returns `true` when the request is already marked for blocking or when `block_ads` flags its
/// URL. This variant is compiled when the `adblock` feature is off.
#[cfg(not(feature = "adblock"))]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    use crate::handler::blockers::block_websites::block_ads;

    skip_networking || block_ads(&event.request.url)
}
/// Queues a `FailRequestParams` command that fails the paused request with the
/// `BlockedByClient` error reason.
#[inline]
fn fail_request_blocked(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
) {
    use chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams;
    use chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason;

    let command = FailRequestParams::new(request_id.clone(), ErrorReason::BlockedByClient);
    self.push_cdp_request(command);
}
/// Queues a `FulfillRequestParams` command that answers the paused request with status `200`
/// and no body or headers set.
#[inline]
fn fulfill_request_empty_200(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
) {
    use chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams;

    let command = FulfillRequestParams::new(request_id.clone(), 200);
    self.push_cdp_request(command);
}
/// Queues a `FulfillRequestParams` command that answers the paused request from cached data.
///
/// `body` is base64-encoded with the standard alphabet, `headers` become the response header
/// entries, and `status` is used as the response code.
#[cfg(feature = "_cache")]
#[inline]
fn fulfill_request_from_cache(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    body: &[u8],
    headers: &std::collections::HashMap<String, String>,
    status: i64,
) {
    use crate::cdp::browser_protocol::fetch::HeaderEntry;
    use crate::handler::network::fetch::FulfillRequestParams;
    use base64::Engine;

    let response_headers: Vec<HeaderEntry> = headers
        .iter()
        .map(|(name, value)| HeaderEntry {
            name: name.clone().into(),
            value: value.clone().into(),
        })
        .collect();
    let encoded_body = base64::engine::general_purpose::STANDARD.encode(body);

    let mut command = FulfillRequestParams::new(request_id.clone(), status);
    command.body = Some(encoded_body.into());
    command.response_headers = Some(response_headers);
    self.push_cdp_request(command);
}
/// Queues a `ContinueRequestParams` command for the paused request.
///
/// When `url` is given, the request is continued with that URL and `intercept_response` is set
/// on the command. Without a URL the command carries only the request id and
/// `intercept_response` is ignored.
#[inline]
fn continue_request_with_url(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    url: Option<&str>,
    intercept_response: bool,
) {
    let mut command = ContinueRequestParams::new(request_id.clone());
    match url {
        Some(target) => {
            command.url = Some(target.to_string());
            command.intercept_response = Some(intercept_response);
        }
        None => {}
    }
    self.push_cdp_request(command);
}
/// Handles a paused Fetch request by queueing one fail, fulfill or continue command for it.
///
/// Decision order:
/// 1. Nothing is done when user interception and protocol interception are both enabled; the
///    request is left for the user to resolve.
/// 2. With `block_all`, the request is failed as blocked.
/// 3. The paused request is paired with its `requestWillBeSent` event, or its interception id
///    is remembered until that event arrives.
/// 4. The blocking heuristics run (resource type, prefetch, reload loop, non-strict blacklist,
///    visuals/stylesheets, ads, embedded scripts, script block lists, XHR rules, the intercept
///    manager and the blocked-website list).
/// 5. The built-in third-party allow-list (scripts only) and the user whitelist may un-block
///    the request; a strict blacklist match is applied last and always blocks.
/// 6. Blocked requests are fulfilled with an empty `200`. Allowed requests are served from the
///    remote cache when the `_cache` feature is on and the policy allows it, otherwise they
///    are continued (with the repaired URL if the document URL had to be replaced).
#[inline]
pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
    if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
        return;
    }

    if self.block_all {
        tracing::debug!(
            "Blocked (block_all): {:?} - {}",
            event.resource_type,
            event.request.url
        );
        return self.fail_request_blocked(&event.request_id);
    }

    if let Some(network_id) = event.network_id.as_ref() {
        if let Some(request_will_be_sent) =
            self.requests_will_be_sent.remove(network_id.as_ref())
        {
            self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
        } else {
            self.request_id_to_interception_id
                .insert(network_id.clone(), event.request_id.clone().into());
        }
    }

    let javascript_resource = event.resource_type == ResourceType::Script;
    let document_resource = event.resource_type == ResourceType::Document;
    let network_resource =
        !document_resource && crate::utils::is_data_resource(&event.resource_type);

    let mut skip_networking =
        self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());

    // Prefetch requests are blocked when prefetch blocking is switched on.
    if event.resource_type == ResourceType::Prefetch && self.block_prefetch {
        skip_networking = true;
    }

    // Stop loading once the same document has been requested three or more times.
    if !skip_networking {
        skip_networking = self.document_reload_tracker >= 3;
    }

    let (current_url_cow, had_replacer) =
        self.handle_document_replacement_and_tracking(event, document_resource);
    let current_url: &str = current_url_cow.as_ref();

    let blacklisted = self.is_blacklisted(current_url);

    // Non-strict blacklist: applied early so later allow rules can still override it.
    if !self.blacklist_strict && blacklisted {
        skip_networking = true;
    }

    if !skip_networking {
        // XSL stylesheets of an XML document are never blocked by the visual rules.
        if self.xml_document && current_url.ends_with(".xsl") {
            skip_networking = false;
        } else {
            skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
        }
    }

    skip_networking = self.detect_ad_if_enabled(event, skip_networking);

    if !skip_networking
        && self.block_javascript
        && (self.only_html || self.ignore_visuals)
        && (javascript_resource || document_resource)
    {
        skip_networking = ignore_script_embedded(current_url);
    }

    if !skip_networking && javascript_resource {
        skip_networking = self.should_block_script_blocklist_only(current_url);
    }

    skip_networking = self.skip_xhr(skip_networking, event, network_resource);

    if !skip_networking && (javascript_resource || network_resource || document_resource) {
        skip_networking = self.intercept_manager.intercept_detection(
            current_url,
            self.ignore_visuals,
            network_resource,
        );
    }

    if !skip_networking && (javascript_resource || network_resource) {
        skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
    }

    // Allow rules: built-in third-party scripts first, then the user whitelist.
    if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
    {
        skip_networking = false;
    }

    if skip_networking && self.is_whitelisted(current_url) {
        skip_networking = false;
    }

    // Strict blacklist: applied last so nothing can override it.
    if self.blacklist_strict && blacklisted {
        skip_networking = true;
    }

    if skip_networking {
        tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
        self.fulfill_request_empty_200(&event.request_id);
    } else {
        #[cfg(feature = "_cache")]
        {
            if let (Some(policy), Some(cache_site_key)) =
                (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
            {
                // Cache entries are keyed by "<METHOD>:<URL>".
                let current_url = format!("{}:{}", event.request.method, &current_url);

                if let Some((res, cache_policy)) =
                    crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
                {
                    if policy.allows_cached(&cache_policy) {
                        tracing::debug!(
                            "Remote Cached: {:?} - {}",
                            &event.resource_type,
                            &current_url
                        );
                        let flat_headers = crate::http::headers_from_multi(&res.headers);
                        return self.fulfill_request_from_cache(
                            &event.request_id,
                            &res.body,
                            &flat_headers,
                            res.status as i64,
                        );
                    }
                }
            }
        }

        tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
        self.continue_request_with_url(
            &event.request_id,
            if had_replacer {
                Some(current_url)
            } else {
                None
            },
            !had_replacer,
        );
    }
}
/// Returns `true` when `resource_type` is a visual resource (`Image`, `Media`, `Font`) and
/// `ignore_visuals` is set, or when it is a stylesheet and `block_stylesheets` is set.
#[inline]
fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
    if self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()) {
        return true;
    }
    self.block_stylesheets && *resource_type == ResourceType::Stylesheet
}
/// Returns `true` when a document target URL has been recorded (the URL, not the domain, is
/// what is checked).
pub fn has_target_domain(&self) -> bool {
    let target_missing = self.document_target_url.is_empty();
    !target_missing
}
/// Records `page_target_url` as the document target and derives the target domain from it.
///
/// The domain becomes the empty string when no host can be extracted from the URL.
pub fn set_page_url(&mut self, page_target_url: String) {
    let domain = match host_and_rest(&page_target_url) {
        Some((host, _)) => base_domain_from_host(host),
        None => "",
    };
    // Copy the domain out before the URL string is moved into the field.
    self.document_target_domain = domain.to_string();
    self.document_target_url = page_target_url;
}
/// Forgets the document target URL and domain and resets the reload counter to zero.
pub fn clear_target_domain(&mut self) {
    self.document_target_domain = String::new();
    self.document_target_url = String::new();
    self.document_reload_tracker = 0;
}
/// Updates document tracking for a paused request and returns the URL to evaluate.
///
/// Only document requests change state:
/// - a request for the current target URL bumps the reload counter (saturating, so a long
///   reload loop can neither overflow the `u8` nor wrap back to zero and un-block itself);
/// - a redirected request whose URL is the target URL with the scheme swapped, immediately
///   followed by the original target URL with `://` written as `//` (for example
///   `https://a.comhttp//a.com/`), is treated as a malformed redirect and replaced by the
///   scheme-swapped URL;
/// - the first document request ending in `.xml` marks the page as an XML document;
/// - the target URL and domain are then set from the request URL.
///
/// Returns the URL to use (borrowed from `event` unless it was replaced) and whether a
/// replacement happened.
#[inline]
fn handle_document_replacement_and_tracking<'a>(
    &mut self,
    event: &'a EventRequestPaused,
    document_resource: bool,
) -> (Cow<'a, str>, bool) {
    let mut replacer: Option<String> = None;
    let current_url = event.request.url.as_str();

    if document_resource {
        if self.document_target_url == current_url {
            self.document_reload_tracker = self.document_reload_tracker.saturating_add(1);
        } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
        {
            // First value: the target URL with a broken scheme separator.
            // Second value: the target URL with http and https swapped.
            let (http_document_replacement, mut https_document_replacement) =
                if self.document_target_url.starts_with("http://") {
                    (
                        self.document_target_url.replacen("http://", "http//", 1),
                        self.document_target_url.replacen("http://", "https://", 1),
                    )
                } else {
                    (
                        self.document_target_url.replacen("https://", "https//", 1),
                        self.document_target_url.replacen("https://", "http://", 1),
                    )
                };

            // Remember the trailing slash so it can be restored on the replacement.
            let trailing = https_document_replacement.ends_with('/');
            if trailing {
                https_document_replacement.pop();
            }
            if https_document_replacement.ends_with('/') {
                https_document_replacement.pop();
            }

            let redirect_mask = format!(
                "{}{}",
                https_document_replacement, http_document_replacement
            );

            if current_url == redirect_mask {
                replacer = Some(if trailing {
                    format!("{}/", https_document_replacement)
                } else {
                    https_document_replacement
                });
            }
        }

        if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
            self.xml_document = true;
        }

        self.document_target_url = event.request.url.clone();
        self.document_target_domain = host_and_rest(&self.document_target_url)
            .map(|(h, _)| base_domain_from_host(h).to_string())
            .unwrap_or_default();
    }

    match replacer {
        Some(replaced) => (Cow::Owned(replaced), true),
        None => (Cow::Borrowed(current_url), false),
    }
}
/// Checks the request against an adblock engine built from
/// `spider_network_blocker::adblock::ADBLOCK_PATTERNS`.
///
/// Only `Image`, `Media`, `Stylesheet`, `Document`, `Fetch` and `Xhr` resources are checked;
/// every other resource type returns `false`.
#[cfg(feature = "adblock")]
pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
use adblock::{
lists::{FilterSet, ParseOptions, RuleTypes},
Engine,
};
// The engine is built once, on first use, and shared afterwards.
lazy_static::lazy_static! {
static ref AD_ENGINE: Engine = {
let mut filter_set = FilterSet::new(false);
let mut rules = ParseOptions::default();
rules.rule_types = RuleTypes::All;
filter_set.add_filters(
&*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
rules,
);
Engine::from_filter_set(filter_set, true)
};
};
let blockable = ResourceType::Image == event.resource_type
|| event.resource_type == ResourceType::Media
|| event.resource_type == ResourceType::Stylesheet
|| event.resource_type == ResourceType::Document
|| event.resource_type == ResourceType::Fetch
|| event.resource_type == ResourceType::Xhr;
let u = &event.request.url;
let block_request = blockable
&& {
// NOTE(review): `example.com` is passed as a fixed placeholder for both hostname
// arguments, and the last argument is presumably the third-party flag - confirm
// against the `adblock` crate's `Request::preparsed` documentation.
let request = adblock::request::Request::preparsed(
&u,
"example.com",
"example.com",
&event.resource_type.as_ref().to_lowercase(),
!event.request.is_same_site.unwrap_or_default());
AD_ENGINE.check_network_request(&request).matched
};
block_request
}
/// Answers an auth challenge by queueing a `ContinueWithAuthParams` command.
///
/// - A request that was already given credentials once gets `CancelAuth`.
/// - Otherwise, if credentials are stored, the request is remembered as attempted and gets
///   `ProvideCredentials`.
/// - Without credentials the response is `Default`.
///
/// Stored credentials, when present, are attached to the response in every case.
pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
    let already_attempted = self
        .attempted_authentications
        .contains(event.request_id.as_ref());
    let has_credentials = self.credentials.is_some();

    let response = match (already_attempted, has_credentials) {
        (true, _) => AuthChallengeResponseResponse::CancelAuth,
        (false, true) => {
            self.attempted_authentications
                .insert(event.request_id.clone().into());
            AuthChallengeResponseResponse::ProvideCredentials
        }
        (false, false) => AuthChallengeResponseResponse::Default,
    };

    let mut auth = AuthChallengeResponse::new(response);
    if let Some(creds) = self.credentials.clone() {
        auth.username = Some(creds.username);
        auth.password = Some(creds.password);
    }

    let command = ContinueWithAuthParams::new(event.request_id.clone(), auth);
    self.push_cdp_request(command);
}
/// Switches offline emulation on or off.
///
/// Nothing happens when `value` equals the stored state. Otherwise the state is stored and a
/// network-conditions command is queued with an empty URL pattern, zero latency and
/// throughput values of `-1`. The command is skipped if it cannot be built.
///
/// # Panics
///
/// Panics if the `NetworkConditions` builder fails to build.
pub fn set_offline_mode(&mut self, value: bool) {
    if value == self.offline {
        return;
    }
    self.offline = value;

    let conditions = NetworkConditions::builder()
        .url_pattern("")
        .latency(0)
        .download_throughput(-1.)
        .upload_throughput(-1.)
        .build()
        .unwrap();
    let built = EmulateNetworkConditionsByRuleParams::builder()
        .offline(self.offline)
        .matched_network_condition(conditions)
        .build();

    if let Ok(network) = built {
        self.push_cdp_request(network);
    }
}
/// Handles a `requestWillBeSent` event.
///
/// Without protocol interception, or for `data:` URLs, the request is registered right away.
/// With interception, the request is registered only once its paused-request counterpart is
/// known; until then the event is stored for `on_fetch_request_paused` to pick up.
pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
    let intercepted =
        self.protocol_request_interception_enabled && !event.request.url.starts_with("data:");
    if !intercepted {
        self.on_request(event, None);
        return;
    }

    match self
        .request_id_to_interception_id
        .remove(event.request_id.as_ref())
    {
        Some(interception_id) => self.on_request(event, Some(interception_id)),
        None => {
            self.requests_will_be_sent
                .insert(event.request_id.clone(), event.clone());
        }
    }
}
/// Flags the tracked request named by the event as served from the memory cache.
/// Unknown request ids are ignored.
pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
    match self.requests.get_mut(event.request_id.as_ref()) {
        Some(request) => request.from_memory_cache = true,
        None => {}
    }
}
/// Handles a received response: charges it against the byte budget and finishes the request.
///
/// With a byte budget set, the encoded data length is subtracted from it; if the response
/// declares a `content-length` larger than what remains, the budget drops to zero. The number
/// of bytes deducted is queued as [`NetworkEvent::BytesConsumed`] when non-zero. An exhausted
/// budget switches the manager to `block_all` and reports the request as failed instead of
/// finished. Responses for unknown request ids queue no request event.
pub fn on_response_received(&mut self, event: &EventResponseReceived) {
    let mut budget_exhausted = false;
    let mut consumed: u64 = 0;

    if let Some(remaining) = self.max_bytes_allowed.as_mut() {
        let starting_budget = *remaining;
        let received: u64 = event.response.encoded_data_length as u64;
        let declared_length: Option<u64> = event
            .response
            .headers
            .inner()
            .get("content-length")
            .and_then(|v| v.as_str())
            .and_then(|s| s.trim().parse::<u64>().ok());

        *remaining = remaining.saturating_sub(received);
        match declared_length {
            Some(length) if length > *remaining => *remaining = 0,
            _ => {}
        }

        budget_exhausted = *remaining == 0;
        consumed = starting_budget.saturating_sub(*remaining);
    }

    if consumed > 0 {
        self.queued_events
            .push_back(NetworkEvent::BytesConsumed(consumed));
    }

    if budget_exhausted && self.max_bytes_allowed.is_some() {
        self.set_block_all(true);
    }

    if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
        request.set_response(event.response.clone());
        let outcome = if budget_exhausted {
            NetworkEvent::RequestFailed(request)
        } else {
            NetworkEvent::RequestFinished(request)
        };
        self.queued_events.push_back(outcome);
    }
}
/// Handles a loading-finished event: stops tracking the request, forgets any auth attempt tied
/// to its interception id and queues [`NetworkEvent::RequestFinished`].
/// Unknown request ids are ignored.
pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
    let request = match self.requests.remove(event.request_id.as_ref()) {
        Some(request) => request,
        None => return,
    };

    if let Some(interception_id) = request.interception_id.as_ref() {
        self.attempted_authentications
            .remove(interception_id.as_ref());
    }

    self.queued_events
        .push_back(NetworkEvent::RequestFinished(request));
}
/// Handles a loading-failed event: stops tracking the request, stores the error text on it,
/// forgets any auth attempt tied to its interception id and queues
/// [`NetworkEvent::RequestFailed`]. Unknown request ids are ignored.
pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
    let mut request = match self.requests.remove(event.request_id.as_ref()) {
        Some(request) => request,
        None => return,
    };

    request.failure_text = Some(event.error_text.clone());

    if let Some(interception_id) = request.interception_id.as_ref() {
        self.attempted_authentications
            .remove(interception_id.as_ref());
    }

    self.queued_events
        .push_back(NetworkEvent::RequestFailed(request));
}
/// Registers a request announced by `event` and queues [`NetworkEvent::Request`] for it.
///
/// When the event carries a redirect response and a request with the same id is already
/// tracked, that earlier request receives the redirect response and is appended to the new
/// request's redirect chain (after its own chain).
fn on_request(
&mut self,
event: &EventRequestWillBeSent,
interception_id: Option<InterceptionId>,
) {
let mut redirect_chain = Vec::new();
let mut redirect_location = None;
if let Some(redirect_resp) = &event.redirect_response {
if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
if is_redirect_status(redirect_resp.status) {
// NOTE(review): the header is looked up under the exact key `Location`; a differently
// cased key would presumably be missed - confirm how header names arrive here.
if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
if redirect_resp.url != location {
// Remove every occurrence of the redirecting URL from the Location value.
let fixed_location = location.replace(&redirect_resp.url, "");
if !fixed_location.is_empty() {
if let Some(resp) = request.response.as_mut() {
resp.headers.0["Location"] =
serde_json::Value::String(fixed_location.clone());
}
}
redirect_location = Some(fixed_location);
}
}
}
// Attach the redirect response, with the rewritten Location when one was computed and
// is non-empty.
self.handle_request_redirect(
&mut request,
if let Some(redirect_location) = redirect_location {
let mut redirect_resp = redirect_resp.clone();
if !redirect_location.is_empty() {
redirect_resp.headers.0["Location"] =
serde_json::Value::String(redirect_location);
}
redirect_resp
} else {
redirect_resp.clone()
},
);
// The previous request moves its chain to the new request and becomes its last link.
redirect_chain = std::mem::take(&mut request.redirect_chain);
redirect_chain.push(request);
}
}
let request = HttpRequest::new(
event.request_id.clone(),
event.frame_id.clone(),
interception_id,
self.user_request_interception_enabled,
redirect_chain,
);
self.requests.insert(event.request_id.clone(), request);
self.queued_events
.push_back(NetworkEvent::Request(event.request_id.clone()));
}
/// Stores the redirect `response` on `request` and forgets any auth attempt tied to the
/// request's interception id.
fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
    request.set_response(response);

    match request.interception_id.as_ref() {
        Some(interception_id) => {
            self.attempted_authentications
                .remove(interception_id.as_ref());
        }
        None => {}
    }
}
}
/// Events produced by [`NetworkManager`] and handed out through `poll`.
#[derive(Debug)]
pub enum NetworkEvent {
/// A CDP command to send: method identifier plus serialized parameters.
SendCdpRequest((MethodId, serde_json::Value)),
/// A request with this id has been registered.
Request(RequestId),
/// Carries a request id; not constructed anywhere in this file.
Response(RequestId),
/// The request failed to load or was cut off by the byte budget.
RequestFailed(HttpRequest),
/// The request completed.
RequestFinished(HttpRequest),
/// Number of bytes deducted from the byte budget by a response.
BytesConsumed(u64),
}
// Unit tests for the allow-list matcher, the script block-list check and the user
// blacklist/whitelist matchers.
#[cfg(test)]
mod tests {
use super::ALLOWED_MATCHER_3RD_PARTY;
use crate::handler::network::NetworkManager;
use std::time::Duration;
// The third-party allow-list matches challenge, payment and captcha scripts but not an
// analytics beacon.
#[test]
fn test_allowed_matcher_3rd_party() {
let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
assert!(
ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
"expected Cloudflare challenge script to be allowed"
);
let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
assert!(
!ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
"expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
);
assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
assert!(ALLOWED_MATCHER_3RD_PARTY
.is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
}
// A script URL that is on no block list is not blocked.
#[test]
fn test_script_allowed_by_default_when_not_blocklisted() {
let mut nm = NetworkManager::new(false, Duration::from_secs(30));
nm.set_page_url(
"https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
);
let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
assert!(
!nm.should_block_script_blocklist_only(ok),
"expected non-blocklisted script to be allowed"
);
}
// A script URL ending in `analytics.js` is blocked with the default settings.
#[test]
fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
let mut nm = NetworkManager::new(false, Duration::from_secs(30));
nm.set_page_url(
"https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
);
let bad = "https://cdn.example.net/js/analytics.js";
assert!(
nm.should_block_script_blocklist_only(bad),
"expected analytics.js to be blocklisted"
);
}
// Same assertions as `test_allowed_matcher_3rd_party`.
#[test]
fn test_allowed_matcher_3rd_party_sanity() {
let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
assert!(
ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
"expected Cloudflare challenge script to be allowed"
);
let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
assert!(
!ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
"expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
);
assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
assert!(ALLOWED_MATCHER_3RD_PARTY
.is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
}
// Blacklist patterns match as substrings of the URL.
#[test]
fn test_dynamic_blacklist_blocks_url() {
let mut nm = NetworkManager::new(false, Duration::from_secs(30));
nm.set_page_url("https://example.com/".to_string());
nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));
assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
}
// Checks the matcher and flag state in strict mode; the final block/allow decision made by
// `on_fetch_request_paused` is not exercised here.
#[test]
fn test_blacklist_strict_wins_over_whitelist() {
let mut nm = NetworkManager::new(false, Duration::from_secs(30));
nm.set_page_url("https://example.com/".to_string());
nm.set_blacklist_patterns(["beacon.min.js"]);
nm.set_whitelist_patterns(["beacon.min.js"]);
nm.set_blacklist_strict(true);
let u = "https://static.cloudflareinsights.com/beacon.min.js";
assert!(nm.is_whitelisted(u));
assert!(nm.is_blacklisted(u));
assert!(nm.blacklist_strict);
}
// Checks the matcher and flag state in non-strict mode; the final block/allow decision made
// by `on_fetch_request_paused` is not exercised here.
#[test]
fn test_blacklist_non_strict_allows_whitelist_override() {
let mut nm = NetworkManager::new(false, Duration::from_secs(30));
nm.set_page_url("https://example.com/".to_string());
nm.set_blacklist_patterns(["beacon.min.js"]);
nm.set_whitelist_patterns(["beacon.min.js"]);
nm.set_blacklist_strict(false);
let u = "https://static.cloudflareinsights.com/beacon.min.js";
assert!(nm.is_blacklisted(u));
assert!(nm.is_whitelisted(u));
assert!(!nm.blacklist_strict);
}
}