#[cfg(any(feature = "adblock", feature = "firewall"))]
use super::blockers::block_websites::block_ads;
use super::blockers::{
block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
xhr::IGNORE_XHR_ASSETS,
};
use crate::auth::Credentials;
#[cfg(feature = "_cache")]
use crate::cache::BasicCachePolicy;
use crate::cmd::CommandChain;
use crate::handler::http::HttpRequest;
use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
use aho_corasick::AhoCorasick;
use case_insensitive_string::CaseInsensitiveString;
use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
use chromiumoxide_cdp::cdp::browser_protocol::network::{
EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
InterceptionId, NetworkConditions, RequestId, ResourceType, Response, SetCacheDisabledParams,
SetExtraHttpHeadersParams,
};
use chromiumoxide_cdp::cdp::browser_protocol::{
fetch::{
self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
},
network::SetBypassServiceWorkerParams,
};
use chromiumoxide_cdp::cdp::browser_protocol::{
network::EnableParams, security::SetIgnoreCertificateErrorsParams,
};
use chromiumoxide_types::{Command, Method, MethodId};
use hashbrown::{HashMap, HashSet};
use lazy_static::lazy_static;
use reqwest::header::PROXY_AUTHORIZATION;
use spider_network_blocker::intercept_manager::NetworkInterceptManager;
pub use spider_network_blocker::scripts::{
URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
};
use std::borrow::Cow;
use std::collections::VecDeque;
use std::time::{Duration, Instant};
lazy_static! {
/// Substring patterns compiled into [`ALLOWED_MATCHER`]: generic framework/bundle
/// names plus URL prefixes of captcha, payment and CDN script hosts.
static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
"jquery", "angular",
"react", "vue", "bootstrap",
"d3",
"lodash",
"ajax",
"application",
"app", "main",
"index",
"bundle",
"vendor",
"runtime",
"polyfill",
"scripts",
"es2015.",
"es2020.",
"webpack",
"captcha",
"client",
"/cdn-cgi/challenge-platform/",
"/wp-content/js/", "https://m.stripe.network/",
"https://challenges.cloudflare.com/",
"https://www.google.com/recaptcha/",
"https://google.com/recaptcha/api.js",
"https://www.gstatic.com/recaptcha/",
"https://captcha.px-cloud.net/",
"https://geo.captcha-delivery.com/",
"https://api.leminnow.com/captcha/",
"https://cdn.auth0.com/js/lock/",
"https://captcha.gtimg.com",
"https://client-api.arkoselabs.com/",
"https://www.capy.me/puzzle/",
"https://newassets.hcaptcha.com/",
"https://cdn.auth0.com/client",
"https://js.stripe.com/",
"https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
];
/// Aho-Corasick matcher over [`JS_FRAMEWORK_ALLOW`]. Building it panics on first
/// access if the automaton cannot be constructed.
pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
/// Substring patterns compiled into [`ALLOWED_MATCHER_3RD_PARTY`].
// NOTE(review): "https://captcha.px-cloud.net/" is listed twice; harmless for matching.
static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
"https://m.stripe.network/",
"https://challenges.cloudflare.com/",
"https://js.stripe.com/",
"https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
"https://ct.captcha-delivery.com/",
"https://geo.captcha-delivery.com/",
"https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://cdn.auth0.com/client",
"https://captcha.px-cloud.net/",
"https://www.capy.me/puzzle/",
"https://www.gstatic.com/recaptcha/",
"https://google.com/recaptcha/",
"https://www.google.com/recaptcha/",
"https://www.recaptcha.net/recaptcha/",
"https://js.hcaptcha.com/1/api.js",
"https://hcaptcha.com/1/api.js",
"https://js.datadome.co/tags.js",
"https://api-js.datadome.co/",
"https://client.perimeterx.net/",
"https://captcha.px-cdn.net/",
"https://newassets.hcaptcha.com/",
"https://captcha.px-cloud.net/",
"https://s.perimeterx.net/",
"https://api.leminnow.com/captcha/",
"https://client-api.arkoselabs.com/",
"https://static.geetest.com/v4/gt4.js",
"https://static.geetest.com/",
"https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
"https://cdn.perfdrive.com/aperture/",
"https://assets.queue-it.net/",
"discourse-cdn.com/",
"hcaptcha.com",
"/cdn-cgi/challenge-platform/",
"/_Incapsula_Resource"
];
/// Aho-Corasick matcher over [`JS_FRAMEWORK_ALLOW_3RD_PARTY`]. The request-paused
/// handler uses it to un-block script requests that earlier checks marked as skipped.
/// Building it panics on first access if the automaton cannot be constructed.
pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
/// Set of framework asset path fragments (not referenced in the visible part of this file).
pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
phf::phf_set! {
"_astro/", "_app/immutable"
}
};
/// Set of MIME type strings for archives, images, video, audio and office documents
/// (not referenced in the visible part of this file).
pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
"application/pdf",
"application/zip",
"application/x-rar-compressed",
"application/x-tar",
"image/png",
"image/jpeg",
"image/gif",
"image/bmp",
"image/webp",
"image/svg+xml",
"video/mp4",
"video/x-msvideo",
"video/x-matroska",
"video/webm",
"audio/mpeg",
"audio/ogg",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/x-7z-compressed",
"application/x-rpm",
"application/x-shockwave-flash",
"application/rtf",
};
/// Resource type names that are skipped when `ignore_visuals` is enabled.
pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
"Image",
"Media",
"Font"
};
/// Resource type names that the request-paused handler always marks as skipped.
pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
"CspViolationReport",
"Ping",
};
/// Case-insensitive `"css"` extension used when filtering data requests by URL suffix.
pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
/// Initial command chain: the default network `EnableParams` command.
/// Empty if the params fail to serialize.
pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
let enable = EnableParams::default();
if let Ok(c) = serde_json::to_value(&enable) {
vec![(enable.identifier(), c)]
} else {
vec![]
}
};
/// Initial command chain that additionally sends `SetIgnoreCertificateErrorsParams(true)`.
/// Commands that fail to serialize are left out.
pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
let enable = EnableParams::default();
let mut v = vec![];
if let Ok(c) = serde_json::to_value(&enable) {
v.push((enable.identifier(), c));
}
let ignore = SetIgnoreCertificateErrorsParams::new(true);
if let Ok(ignored) = serde_json::to_value(&ignore) {
v.push((ignore.identifier(), ignored));
}
v
};
/// Fetch-domain enable params: auth requests are handled and every URL (`"*"`)
/// is paused at the request stage.
pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
fetch::EnableParams::builder()
.handle_auth_requests(true)
.pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
.build()
};
}
/// Returns `true` when `status` is one of the HTTP redirect codes this handler
/// follows: 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
/// Age (seconds) after which buffered request-will-be-sent events and pending
/// interception ids are evicted.
const STALE_BUFFER_SECS: u64 = 30;
/// Age (seconds) after which tracked in-flight requests are evicted.
const STALE_REQUEST_SECS: u64 = 120;
/// Shared handle to a caller-supplied `adblock::Engine`.
#[cfg(feature = "adblock")]
pub struct AdblockEngine(std::sync::Arc<adblock::Engine>);

/// Opaque `Debug` output: only the type name is printed.
#[cfg(feature = "adblock")]
impl std::fmt::Debug for AdblockEngine {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("AdblockEngine")
    }
}

/// Lets an `&AdblockEngine` be used wherever an `&adblock::Engine` is expected.
#[cfg(feature = "adblock")]
impl std::ops::Deref for AdblockEngine {
    type Target = adblock::Engine;

    fn deref(&self) -> &adblock::Engine {
        let AdblockEngine(inner) = self;
        inner
    }
}
/// Tracks network requests for a page and decides, for every paused fetch
/// request, whether it is continued, fulfilled with an empty response or failed.
#[derive(Debug)]
pub struct NetworkManager {
// Outgoing events (CDP commands and request notifications) drained via `poll`.
queued_events: VecDeque<NetworkEvent>,
// Selects the init chain that also ignores certificate errors.
ignore_httpserrors: bool,
// In-flight requests keyed by network request id.
requests: HashMap<RequestId, HttpRequest>,
// Request-will-be-sent events waiting for their paused-request counterpart, with insertion time.
requests_will_be_sent: HashMap<RequestId, (EventRequestWillBeSent, Instant)>,
// Extra HTTP headers sent with requests (proxy authorization is stripped on set).
extra_headers: std::collections::HashMap<String, String>,
// Interception ids waiting for their request-will-be-sent counterpart, with insertion time.
request_id_to_interception_id: HashMap<RequestId, (InterceptionId, Instant)>,
// `true` when the caller asked for the cache to be disabled.
user_cache_disabled: bool,
// Requests for which credentials were already provided once.
attempted_authentications: HashSet<RequestId>,
// Credentials supplied in answer to auth challenges.
credentials: Option<Credentials>,
// Whether the user asked for request interception.
pub(crate) user_request_interception_enabled: bool,
// When set, every paused request is failed as blocked.
block_all: bool,
// Whether interception is considered active at the protocol level.
pub(crate) protocol_request_interception_enabled: bool,
// Last offline-emulation value that was requested.
offline: bool,
/// Timeout handed to the init command chain.
pub request_timeout: Duration,
/// Skip visual resources (image, media, font).
pub ignore_visuals: bool,
/// Skip stylesheet resources.
pub block_stylesheets: bool,
/// Participates in the embedded-script skip check.
pub block_javascript: bool,
/// Enables the analytics/tracker blocklist checks.
pub block_analytics: bool,
/// Consulted for `Prefetch` resources in the request-paused handler.
pub block_prefetch: bool,
/// Participates in the embedded-script skip check.
pub only_html: bool,
/// Set when the first document URL ends with `.xml`; lets `.xsl` requests through.
pub xml_document: bool,
/// Site-specific interception rules.
pub intercept_manager: NetworkInterceptManager,
/// Number of document requests seen for the current target URL.
pub document_reload_tracker: u8,
/// URL of the most recently requested document.
pub document_target_url: String,
/// Base domain derived from `document_target_url`.
pub document_target_domain: String,
/// Remaining byte budget; when it reaches zero all further requests are blocked.
pub max_bytes_allowed: Option<u64>,
/// Key used to look up entries in the session cache.
#[cfg(feature = "_cache")]
pub cache_site_key: Option<String>,
/// Policy deciding whether a cached entry may be served.
#[cfg(feature = "_cache")]
pub cache_policy: Option<BasicCachePolicy>,
// Substring patterns that un-block a skipped request, and their compiled matcher.
whitelist_patterns: Vec<String>,
whitelist_matcher: Option<AhoCorasick>,
// Substring patterns that block a request, and their compiled matcher.
blacklist_patterns: Vec<String>,
blacklist_matcher: Option<AhoCorasick>,
// When `true` the blacklist is applied last and cannot be overridden by allow rules.
blacklist_strict: bool,
// Optional caller-supplied adblock engine; a built-in engine is used otherwise.
#[cfg(feature = "adblock")]
adblock_engine: Option<AdblockEngine>,
}
impl NetworkManager {
pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
Self {
queued_events: Default::default(),
ignore_httpserrors,
requests: Default::default(),
requests_will_be_sent: Default::default(),
extra_headers: Default::default(),
request_id_to_interception_id: Default::default(),
user_cache_disabled: false,
attempted_authentications: Default::default(),
credentials: None,
block_all: false,
user_request_interception_enabled: false,
protocol_request_interception_enabled: false,
offline: false,
request_timeout,
ignore_visuals: false,
block_javascript: false,
block_stylesheets: false,
block_prefetch: true,
block_analytics: true,
only_html: false,
xml_document: false,
intercept_manager: NetworkInterceptManager::Unknown,
document_reload_tracker: 0,
document_target_url: String::new(),
document_target_domain: String::new(),
whitelist_patterns: Vec::new(),
whitelist_matcher: None,
blacklist_patterns: Vec::new(),
blacklist_matcher: None,
blacklist_strict: true,
max_bytes_allowed: None,
#[cfg(feature = "_cache")]
cache_site_key: None,
#[cfg(feature = "_cache")]
cache_policy: None,
#[cfg(feature = "adblock")]
adblock_engine: None,
}
}
/// Installs a custom adblock engine that `detect_ad` uses instead of the built-in one.
#[cfg(feature = "adblock")]
pub fn set_adblock_engine(&mut self, engine: std::sync::Arc<adblock::Engine>) {
self.adblock_engine = Some(AdblockEngine(engine));
}
/// Replaces the whole whitelist with `patterns` and recompiles its matcher.
pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
where
    I: IntoIterator<Item = S>,
    S: Into<String>,
{
    let replacement: Vec<String> = patterns
        .into_iter()
        .map(|pattern| pattern.into())
        .collect();
    self.whitelist_patterns = replacement;
    self.rebuild_whitelist_matcher();
}
/// Replaces the whole blacklist with `patterns` and recompiles its matcher.
pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
where
    I: IntoIterator<Item = S>,
    S: Into<String>,
{
    let replacement: Vec<String> = patterns
        .into_iter()
        .map(|pattern| pattern.into())
        .collect();
    self.blacklist_patterns = replacement;
    self.rebuild_blacklist_matcher();
}
/// Appends one pattern to the blacklist and recompiles its matcher.
pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
self.blacklist_patterns.push(pattern.into());
self.rebuild_blacklist_matcher();
}
/// Appends several patterns to the blacklist and recompiles its matcher once.
pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
where
    I: IntoIterator<Item = S>,
    S: Into<String>,
{
    let additions = patterns.into_iter().map(|pattern| pattern.into());
    self.blacklist_patterns.extend(additions);
    self.rebuild_blacklist_matcher();
}
/// Removes every blacklist pattern and drops the compiled matcher.
pub fn clear_blacklist(&mut self) {
self.blacklist_patterns.clear();
self.blacklist_matcher = None;
}
/// Chooses whether the blacklist is applied last (strict: allow rules cannot override it)
/// or early (non-strict: later allow rules may un-block the request).
pub fn set_blacklist_strict(&mut self, strict: bool) {
self.blacklist_strict = strict;
}
/// Recompiles the blacklist matcher from the current patterns.
///
/// The matcher is `None` when there are no patterns or when compilation fails.
#[inline]
fn rebuild_blacklist_matcher(&mut self) {
    self.blacklist_matcher = if self.blacklist_patterns.is_empty() {
        None
    } else {
        let patterns = self.blacklist_patterns.iter().map(String::as_str);
        AhoCorasick::new(patterns).ok()
    };
}
/// Returns `true` when a blacklist matcher exists and any pattern occurs in `url`.
#[inline]
fn is_blacklisted(&self, url: &str) -> bool {
    match &self.blacklist_matcher {
        Some(matcher) => matcher.is_match(url),
        None => false,
    }
}
/// Appends one pattern to the whitelist and recompiles its matcher.
pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
self.whitelist_patterns.push(pattern.into());
self.rebuild_whitelist_matcher();
}
/// Appends several patterns to the whitelist and recompiles its matcher once.
pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
where
    I: IntoIterator<Item = S>,
    S: Into<String>,
{
    let additions = patterns.into_iter().map(|pattern| pattern.into());
    self.whitelist_patterns.extend(additions);
    self.rebuild_whitelist_matcher();
}
/// Recompiles the whitelist matcher from the current patterns.
///
/// The matcher is `None` when there are no patterns or when compilation fails.
#[inline]
fn rebuild_whitelist_matcher(&mut self) {
    self.whitelist_matcher = if self.whitelist_patterns.is_empty() {
        None
    } else {
        let patterns = self.whitelist_patterns.iter().map(String::as_str);
        AhoCorasick::new(patterns).ok()
    };
}
/// Returns `true` when a whitelist matcher exists and any pattern occurs in `url`.
#[inline]
fn is_whitelisted(&self, url: &str) -> bool {
    match &self.whitelist_matcher {
        Some(matcher) => matcher.is_match(url),
        None => false,
    }
}
/// Builds the command chain sent when the network domain is initialised.
///
/// The chain that also ignores certificate errors is chosen when
/// `ignore_httpserrors` is set.
pub fn init_commands(&self) -> CommandChain {
    let template = if self.ignore_httpserrors {
        &*INIT_CHAIN_IGNORE_HTTP_ERRORS
    } else {
        &*INIT_CHAIN
    };
    CommandChain::new(template.clone(), self.request_timeout)
}
/// Serialises `cmd` and queues it as a `SendCdpRequest` event.
///
/// A command that fails to serialise is dropped silently.
pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
    // The identifier must be read before `cmd` is consumed by serialisation.
    let method = cmd.identifier();
    match serde_json::to_value(cmd) {
        Ok(params) => self
            .queued_events
            .push_back(NetworkEvent::SendCdpRequest((method, params))),
        Err(_) => {}
    }
}
/// Pops the oldest queued event, or returns `None` when the queue is empty.
pub fn poll(&mut self) -> Option<NetworkEvent> {
self.queued_events.pop_front()
}
/// Drops bookkeeping entries that have outlived their staleness window.
///
/// * Buffered request-will-be-sent events and pending interception ids older
///   than `STALE_BUFFER_SECS` are removed.
/// * Tracked requests created more than `STALE_REQUEST_SECS` ago are removed.
/// * Attempted-authentication ids no longer referenced by a live request are removed.
///
/// `now` is the reference instant, normally `Instant::now()`.
pub fn evict_stale_entries(&mut self, now: Instant) {
    // `Instant - Duration` panics when the result is not representable (for
    // example shortly after boot on a monotonic clock). In that case nothing
    // can be older than the window, so the sweep is simply skipped.
    if let Some(cutoff) = now.checked_sub(Duration::from_secs(STALE_BUFFER_SECS)) {
        self.requests_will_be_sent.retain(|_, (_, ts)| *ts > cutoff);
        self.request_id_to_interception_id
            .retain(|_, (_, ts)| *ts > cutoff);
    }
    if let Some(request_cutoff) = now.checked_sub(Duration::from_secs(STALE_REQUEST_SECS)) {
        self.requests
            .retain(|_, req| req.created_at > request_cutoff);
    }
    if !self.attempted_authentications.is_empty() {
        // Keep only the ids that still belong to a tracked request.
        let live: HashSet<&str> = self
            .requests
            .values()
            .filter_map(|r| r.interception_id.as_ref().map(|id| id.as_ref()))
            .collect();
        self.attempted_authentications
            .retain(|id| live.contains(id.as_ref()));
    }
}
/// Returns the extra HTTP headers currently configured.
pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
&self.extra_headers
}
/// Stores `headers` as the extra HTTP headers and queues the matching CDP command.
///
/// Any `Proxy-Authorization` header is removed first, whatever its letter case,
/// so proxy credentials are never forwarded as an extra header. No command is
/// queued when the remaining map is empty or fails to serialise.
pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
    self.extra_headers = headers;
    // Header names are case-insensitive, so match every spelling rather than
    // just the two exact ones.
    self.extra_headers
        .retain(|name, _| !name.eq_ignore_ascii_case(PROXY_AUTHORIZATION.as_str()));
    if !self.extra_headers.is_empty() {
        if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
            self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
        }
    }
}
/// Queues a `SetBypassServiceWorkerParams` command with `bypass` as its value.
// NOTE(review): despite the method name, the argument is forwarded as the *bypass* flag.
pub fn set_service_worker_enabled(&mut self, bypass: bool) {
self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
}
/// Sets whether every paused request is failed as blocked.
pub fn set_block_all(&mut self, block_all: bool) {
self.block_all = block_all;
}
/// Records the user's interception preference and queues a fetch enable/disable
/// command if the desired protocol state differs from the current one.
pub fn set_request_interception(&mut self, enabled: bool) {
self.user_request_interception_enabled = enabled;
self.update_protocol_request_interception();
}
/// Enables or disables the browser cache.
///
/// A CDP command is only queued when the setting actually changes.
pub fn set_cache_enabled(&mut self, enabled: bool) {
    let previously_disabled = self.user_cache_disabled;
    self.user_cache_disabled = !enabled;
    // "disabled == enabled" means the stored flag had to flip.
    if previously_disabled == enabled {
        self.update_protocol_cache_disabled();
    }
}
/// Marks protocol-level interception as enabled (no CDP command is queued).
pub fn enable_request_intercept(&mut self) {
self.protocol_request_interception_enabled = true;
}
/// Marks protocol-level interception as disabled (no CDP command is queued).
pub fn disable_request_intercept(&mut self) {
self.protocol_request_interception_enabled = false;
}
/// Sets the site key used for session-cache lookups.
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
self.cache_site_key = cache_site_key;
}
/// Sets the policy that decides whether a cached entry may be served.
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
self.cache_policy = cache_policy;
}
/// Queues a `SetCacheDisabledParams` command reflecting the stored cache preference.
pub fn update_protocol_cache_disabled(&mut self) {
self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
}
/// Stores credentials for auth challenges and turns interception on.
pub fn authenticate(&mut self, credentials: Credentials) {
self.credentials = Some(credentials);
// Must run before the flag below is set: the update only queues the fetch
// enable command when the protocol flag is still `false`.
self.update_protocol_request_interception();
self.protocol_request_interception_enabled = true;
}
/// Queues a fetch enable or disable command when the desired interception
/// state differs from the recorded protocol state.
///
/// Interception is wanted when the user enabled it or credentials are set.
/// This method does not modify `protocol_request_interception_enabled`.
fn update_protocol_request_interception(&mut self) {
    let wanted = self.user_request_interception_enabled || self.credentials.is_some();
    match (wanted, self.protocol_request_interception_enabled) {
        (true, false) => self.push_cdp_request(ENABLE_FETCH.clone()),
        (false, true) => self.push_cdp_request(DisableParams::default()),
        // Already in the desired state.
        _ => {}
    }
}
/// Decides whether a script URL is blocked purely by the static blocklists.
///
/// The full URL is checked first (analytics trie when `block_analytics` is on,
/// then the website blocklist). After that the URL path, with query and
/// fragment stripped, is checked in three shapes: with a leading slash,
/// without it, and the final path segment only.
#[inline]
fn should_block_script_blocklist_only(&self, url: &str) -> bool {
    let analytics = self.block_analytics;

    if analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url) {
        return true;
    }
    if crate::handler::blockers::block_websites::block_website(url) {
        return true;
    }

    let path = match Self::url_path_with_leading_slash(url) {
        Some(raw) => Self::strip_query_fragment(raw),
        None => return false,
    };
    let relative = path.strip_prefix('/').unwrap_or(path);
    let file_name = path.rsplit('/').next().unwrap_or(path);

    let analytics_hit = analytics
        && (URL_IGNORE_TRIE_PATHS.contains_prefix(path)
            || URL_IGNORE_TRIE_PATHS.contains_prefix(relative)
            || URL_IGNORE_TRIE_PATHS.contains_prefix(file_name));

    analytics_hit
        || URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(relative)
        || (self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(relative))
}
/// Returns the part of `url` starting at the first `/` after the first `//`
/// (path, query and fragment included), or `None` when either is missing.
#[inline]
fn url_path_with_leading_slash(url: &str) -> Option<&str> {
    let (_, after_scheme) = url.split_once("//")?;
    let path_start = after_scheme.find('/')?;
    after_scheme.get(path_start..)
}
/// Truncates `s` at the first `?` or `#`, whichever comes first.
/// Returns `s` unchanged when neither is present.
#[inline]
fn strip_query_fragment(s: &str) -> &str {
    match s.find(|c: char| matches!(c, '?' | '#')) {
        Some(cut) => &s[..cut],
        None => s,
    }
}
/// Decides whether a data (XHR/fetch-like) request should be skipped.
///
/// Returns `skip_networking` unchanged unless the request is a network event
/// that has not already been skipped. In that case:
/// * analytics URLs are skipped when `block_analytics` is on;
/// * otherwise, when stylesheets or visuals are blocked, the URL's extension
///   (text after the last `.`) is checked against the ignored asset list and
///   the `css` extension, with a media blocklist as a final fallback.
#[inline]
fn skip_xhr(
&self,
skip_networking: bool,
event: &EventRequestPaused,
network_event: bool,
) -> bool {
if !skip_networking && network_event {
let request_url = event.request.url.as_str();
let skip_analytics =
self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
if skip_analytics {
true
} else if self.block_stylesheets || self.ignore_visuals {
let block_css = self.block_stylesheets;
let block_media = self.ignore_visuals;
let mut block_request = false;
if let Some(position) = request_url.rfind('.') {
let hlen = request_url.len();
// Length of the suffix starting at the dot (dot included).
let has_asset = hlen - position;
if has_asset >= 3 {
let next_position = position + 1;
if block_media
&& IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
&request_url[next_position..].into(),
)
{
block_request = true;
} else if block_css {
// Case-insensitive check for "css" inside the extension.
block_request = CaseInsensitiveString::from(
&request_url.as_bytes()[next_position..],
)
.contains(&**CSS_EXTENSION)
}
}
}
if !block_request {
block_request = ignore_script_xhr_media(request_url);
}
block_request
} else {
// `skip_networking` is necessarily `false` on this path.
skip_networking
}
} else {
skip_networking
}
}
/// Returns `true` when the request is already skipped, matches the static ad
/// list, or is flagged by the adblock engine.
#[cfg(feature = "adblock")]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    skip_networking || block_ads(&event.request.url) || self.detect_ad(event)
}
/// Returns `true` when the request is already skipped or matches the static ad list.
#[cfg(not(feature = "adblock"))]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    use crate::handler::blockers::block_websites::block_ads;
    skip_networking || block_ads(&event.request.url)
}
#[inline]
fn fail_request_blocked(
&mut self,
request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
) {
let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
request_id.clone(),
chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
);
self.push_cdp_request(params);
}
#[inline]
fn fulfill_request_empty_200(
&mut self,
request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
) {
let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
request_id.clone(),
200,
);
self.push_cdp_request(params);
}
/// Queues a fulfill-request command that answers `request_id` from cached data.
///
/// `body` is sent base64-encoded, `headers` become the response headers and
/// `status` the response code.
#[cfg(feature = "_cache")]
#[inline]
fn fulfill_request_from_cache(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    body: &[u8],
    headers: &std::collections::HashMap<String, String>,
    status: i64,
) {
    use crate::cdp::browser_protocol::fetch::HeaderEntry;
    use crate::handler::network::fetch::FulfillRequestParams;
    use base64::Engine;

    let response_headers: Vec<HeaderEntry> = headers
        .iter()
        .map(|(name, value)| HeaderEntry {
            name: name.clone().into(),
            value: value.clone().into(),
        })
        .collect();
    let encoded_body = base64::engine::general_purpose::STANDARD.encode(body);

    let mut params = FulfillRequestParams::new(request_id.clone(), status);
    params.body = Some(encoded_body.into());
    params.response_headers = Some(response_headers);
    self.push_cdp_request(params);
}
/// Queues a continue-request command for `request_id`.
///
/// When `url` is given the request is redirected to it and
/// `intercept_response` is forwarded; without a URL both are left unset.
#[inline]
fn continue_request_with_url(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    url: Option<&str>,
    intercept_response: bool,
) {
    let mut params = ContinueRequestParams::new(request_id.clone());
    match url {
        Some(target) => {
            params.url = Some(target.to_string());
            params.intercept_response = Some(intercept_response);
        }
        None => {}
    }
    self.push_cdp_request(params);
}
/// Handles a paused fetch request: decides whether it is blocked, served from
/// the session cache, or continued.
///
/// Order of decisions:
/// 1. Nothing is done when both user and protocol interception are enabled.
/// 2. With `block_all`, the request is failed as blocked.
/// 3. The paused request is paired with its request-will-be-sent event (or its
///    interception id is buffered until that event arrives).
/// 4. A chain of skip checks runs (resource type, reload loop, non-strict
///    blacklist, visuals/stylesheets, ads, embedded scripts, script blocklists,
///    XHR filters, site intercept rules, website blocklist).
/// 5. Allow rules (third-party script allow-list, whitelist) may un-block.
/// 6. A strict blacklist match always blocks.
///
/// Blocked requests are fulfilled with an empty 200 response; allowed requests
/// are continued, optionally with a repaired redirect URL.
#[inline]
pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
    if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
        return;
    }
    if self.block_all {
        tracing::debug!(
            "Blocked (block_all): {:?} - {}",
            event.resource_type,
            event.request.url
        );
        return self.fail_request_blocked(&event.request_id);
    }
    if let Some(network_id) = event.network_id.as_ref() {
        if let Some((request_will_be_sent, _)) =
            self.requests_will_be_sent.remove(network_id.as_ref())
        {
            self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
        } else {
            // The network event has not arrived yet; remember the interception id.
            self.request_id_to_interception_id.insert(
                network_id.clone(),
                (event.request_id.clone().into(), Instant::now()),
            );
        }
    }
    let javascript_resource = event.resource_type == ResourceType::Script;
    let document_resource = event.resource_type == ResourceType::Document;
    let network_resource =
        !document_resource && crate::utils::is_data_resource(&event.resource_type);
    let mut skip_networking =
        self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
    // NOTE(review): prefetch is skipped when `block_prefetch` is *false*, which
    // looks inverted relative to the flag name — confirm the intended polarity.
    if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
        skip_networking = true;
    }
    if !skip_networking {
        // Guard against reload loops on the same document.
        skip_networking = self.document_reload_tracker >= 3;
    }
    let (current_url_cow, had_replacer) =
        self.handle_document_replacement_and_tracking(event, document_resource);
    let current_url: &str = current_url_cow.as_ref();
    let blacklisted = self.is_blacklisted(current_url);
    if !self.blacklist_strict && blacklisted {
        skip_networking = true;
    }
    if !skip_networking {
        if self.xml_document && current_url.ends_with(".xsl") {
            skip_networking = false;
        } else {
            skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
        }
    }
    skip_networking = self.detect_ad_if_enabled(event, skip_networking);
    if !skip_networking
        && self.block_javascript
        && (self.only_html || self.ignore_visuals)
        && (javascript_resource
            || document_resource
            || event.resource_type == ResourceType::Stylesheet
            || event.resource_type == ResourceType::Image)
    {
        skip_networking = ignore_script_embedded(current_url);
    }
    if !skip_networking && javascript_resource {
        skip_networking = self.should_block_script_blocklist_only(current_url);
    }
    skip_networking = self.skip_xhr(skip_networking, event, network_resource);
    if !skip_networking && (javascript_resource || network_resource || document_resource) {
        skip_networking = self.intercept_manager.intercept_detection(
            current_url,
            self.ignore_visuals,
            network_resource,
        );
    }
    if !skip_networking && (javascript_resource || network_resource) {
        skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
    }
    // Allow rules: these can only un-block.
    if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
    {
        skip_networking = false;
    }
    if skip_networking && self.is_whitelisted(current_url) {
        skip_networking = false;
    }
    // A strict blacklist wins over every allow rule above.
    if self.blacklist_strict && blacklisted {
        skip_networking = true;
    }
    if skip_networking {
        tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
        self.fulfill_request_empty_200(&event.request_id);
    } else {
        #[cfg(feature = "_cache")]
        {
            if let (Some(policy), Some(cache_site_key)) =
                (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
            {
                // The cache key is "<METHOD>:<url>".
                let current_url = format!("{}:{}", event.request.method, &current_url);
                if let Some((res, cache_policy)) =
                    crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
                {
                    if policy.allows_cached(&cache_policy) {
                        tracing::debug!(
                            "Remote Cached: {:?} - {}",
                            &event.resource_type,
                            &current_url
                        );
                        let flat_headers = crate::http::headers_from_multi(&res.headers);
                        return self.fulfill_request_from_cache(
                            &event.request_id,
                            &res.body,
                            &flat_headers,
                            res.status as i64,
                        );
                    }
                }
            }
        }
        tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
        self.continue_request_with_url(
            &event.request_id,
            if had_replacer {
                Some(current_url)
            } else {
                None
            },
            !had_replacer,
        );
    }
}
/// Returns `true` when the resource type is a visual one and visuals are
/// ignored, or when it is a stylesheet and stylesheets are blocked.
#[inline]
fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
    if self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()) {
        return true;
    }
    self.block_stylesheets && *resource_type == ResourceType::Stylesheet
}
/// Returns `true` once a document target URL has been recorded.
pub fn has_target_domain(&self) -> bool {
!self.document_target_url.is_empty()
}
/// Records `page_target_url` as the document target and derives its base domain.
///
/// The domain is left empty when no host can be extracted from the URL.
pub fn set_page_url(&mut self, page_target_url: String) {
    let domain = host_and_rest(&page_target_url)
        .map(|(host, _)| base_domain_from_host(host).to_string())
        .unwrap_or_default();
    self.document_target_domain = domain;
    self.document_target_url = page_target_url;
}
/// Forgets the current document target and resets the reload counter.
pub fn clear_target_domain(&mut self) {
    self.document_target_url = String::new();
    self.document_target_domain = String::new();
    self.document_reload_tracker = 0;
}
/// Tracks document navigations and repairs one known malformed redirect shape.
///
/// For document requests this:
/// * counts repeated requests for the same target URL (reload-loop detection),
///   saturating at `u8::MAX` so the counter never overflows or wraps to zero;
/// * on a redirected request, detects a URL of the form
///   `<other-scheme target><target with "://" turned into "//">` and replaces it
///   with the target URL under the other scheme;
/// * flags the page as an XML document when the first document URL ends in `.xml`;
/// * updates the recorded target URL and base domain.
///
/// Returns the URL to use for the request (borrowed from the event unless it
/// was replaced) and whether a replacement happened.
#[inline]
fn handle_document_replacement_and_tracking<'a>(
    &mut self,
    event: &'a EventRequestPaused,
    document_resource: bool,
) -> (Cow<'a, str>, bool) {
    let mut replacer: Option<String> = None;
    let current_url = event.request.url.as_str();
    if document_resource {
        if self.document_target_url == current_url {
            // Saturate: a plain `+= 1` would panic in debug builds and wrap to 0
            // in release builds after 255 reloads, resetting the loop guard.
            self.document_reload_tracker = self.document_reload_tracker.saturating_add(1);
        } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
        {
            let (http_document_replacement, mut https_document_replacement) =
                if self.document_target_url.starts_with("http://") {
                    (
                        self.document_target_url.replacen("http://", "http//", 1),
                        self.document_target_url.replacen("http://", "https://", 1),
                    )
                } else {
                    (
                        self.document_target_url.replacen("https://", "https//", 1),
                        self.document_target_url.replacen("https://", "http://", 1),
                    )
                };
            // Remember the trailing slash so it can be restored on the replacement.
            let trailing = https_document_replacement.ends_with('/');
            if trailing {
                https_document_replacement.pop();
            }
            if https_document_replacement.ends_with('/') {
                https_document_replacement.pop();
            }
            let redirect_mask = format!(
                "{}{}",
                https_document_replacement, http_document_replacement
            );
            if current_url == redirect_mask {
                replacer = Some(if trailing {
                    format!("{}/", https_document_replacement)
                } else {
                    https_document_replacement
                });
            }
        }
        if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
            self.xml_document = true;
        }
        self.document_target_url = event.request.url.clone();
        self.document_target_domain = host_and_rest(&self.document_target_url)
            .map(|(h, _)| base_domain_from_host(h).to_string())
            .unwrap_or_default();
    }
    let current_url_cow = match replacer {
        Some(r) => Cow::Owned(r),
        None => Cow::Borrowed(event.request.url.as_str()),
    };
    let had_replacer = matches!(current_url_cow, Cow::Owned(_));
    (current_url_cow, had_replacer)
}
/// Checks the paused request against the adblock engine.
///
/// Only script, image, media, stylesheet, document, fetch and XHR resources
/// are checked; every other type returns `false`. A caller-supplied engine is
/// used when one is installed, otherwise a lazily built default engine.
#[cfg(feature = "adblock")]
pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
    use adblock::{
        lists::{FilterSet, ParseOptions, RuleTypes},
        Engine,
    };
    lazy_static::lazy_static! {
        static ref AD_ENGINE: Engine = {
            let mut filter_set = FilterSet::new(false);
            let mut rules = ParseOptions::default();
            rules.rule_types = RuleTypes::All;
            filter_set.add_filters(
                &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
                rules.clone(),
            );
            #[cfg(feature = "adblock_easylist")]
            {
                static EASYLIST: &str = include_str!(concat!(env!("OUT_DIR"), "/easylist.txt"));
                static EASYPRIVACY: &str = include_str!(concat!(env!("OUT_DIR"), "/easyprivacy.txt"));
                if !EASYLIST.is_empty() {
                    filter_set.add_filter_list(EASYLIST, rules.clone());
                }
                if !EASYPRIVACY.is_empty() {
                    filter_set.add_filter_list(EASYPRIVACY, rules);
                }
            }
            Engine::from_filter_set(filter_set, true)
        };
    }

    // Map the resource type to the engine's label; unlisted types are never blocked.
    let resource_type_str = match event.resource_type {
        ResourceType::Script => "script",
        ResourceType::Image => "image",
        ResourceType::Media => "media",
        ResourceType::Stylesheet => "stylesheet",
        ResourceType::Document => "document",
        ResourceType::Fetch => "fetch",
        ResourceType::Xhr => "xhr",
        _ => return false,
    };

    let url = &event.request.url;
    let source_domain = if self.document_target_domain.is_empty() {
        "example.com"
    } else {
        self.document_target_domain.as_str()
    };

    // Host of the request: authority without userinfo and port; falls back to
    // the source domain for non-http(s) URLs.
    let hostname = url
        .strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))
        .and_then(|rest| rest.split('/').next())
        .map(|authority| {
            authority
                .rfind('@')
                .map_or(authority, |at| &authority[at + 1..])
        })
        .and_then(|host_port| host_port.split(':').next())
        .unwrap_or(source_domain);

    let third_party = !event.request.is_same_site.unwrap_or_default();
    let request = adblock::request::Request::preparsed(
        url,
        hostname,
        source_domain,
        resource_type_str,
        third_party,
    );

    let engine: &Engine = match &self.adblock_engine {
        Some(custom) => custom,
        None => &AD_ENGINE,
    };
    engine.check_network_request(&request).matched
}
/// Answers an auth challenge.
///
/// * Cancels when credentials were already tried for this request.
/// * Provides credentials (and records the attempt) when some are configured.
/// * Otherwise falls back to the default browser behaviour.
///
/// Configured credentials are attached to the response in every case.
pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
    let already_tried = self
        .attempted_authentications
        .contains(event.request_id.as_ref());

    let decision = if already_tried {
        AuthChallengeResponseResponse::CancelAuth
    } else if self.credentials.is_some() {
        self.attempted_authentications
            .insert(event.request_id.clone().into());
        AuthChallengeResponseResponse::ProvideCredentials
    } else {
        AuthChallengeResponseResponse::Default
    };

    let mut auth = AuthChallengeResponse::new(decision);
    if let Some(creds) = self.credentials.as_ref() {
        auth.username = Some(creds.username.clone());
        auth.password = Some(creds.password.clone());
    }
    self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
}
/// Switches offline emulation on or off.
///
/// Does nothing when the value is unchanged. The new value is stored even if
/// building the CDP command fails, in which case no command is queued.
pub fn set_offline_mode(&mut self, value: bool) {
    if self.offline == value {
        return;
    }
    self.offline = value;

    let condition = match NetworkConditions::builder()
        .url_pattern("")
        .latency(0)
        .download_throughput(-1.)
        .upload_throughput(-1.)
        .build()
    {
        Ok(condition) => condition,
        Err(_) => return,
    };

    let params = EmulateNetworkConditionsByRuleParams::builder()
        .offline(self.offline)
        .matched_network_condition(condition)
        .build();
    if let Ok(network) = params {
        self.push_cdp_request(network);
    }
}
/// Handles a request-will-be-sent event.
///
/// Without interception (or for `data:` URLs) the request is registered
/// immediately. With interception it is paired with the interception id of its
/// paused counterpart, or buffered until that counterpart arrives.
pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
    let intercepted =
        self.protocol_request_interception_enabled && !event.request.url.starts_with("data:");
    if !intercepted {
        self.on_request(event, None);
        return;
    }

    match self
        .request_id_to_interception_id
        .remove(event.request_id.as_ref())
    {
        Some((interception_id, _)) => self.on_request(event, Some(interception_id)),
        None => {
            self.requests_will_be_sent
                .insert(event.request_id.clone(), (event.clone(), Instant::now()));
        }
    }
}
/// Marks the tracked request, if any, as served from the memory cache.
pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
request.from_memory_cache = true;
}
}
/// Handles a received response: updates the byte budget and finishes the request.
///
/// When a byte budget is set, the encoded length of the response is subtracted
/// from it; a declared `content-length` larger than what remains zeroes the
/// budget. Any deduction is reported via `BytesConsumed`. An exhausted budget
/// enables block-all and marks the request as failed.
pub fn on_response_received(&mut self, event: &EventResponseReceived) {
    let mut request_failed = false;
    let mut deducted: u64 = 0;

    if let Some(remaining) = self.max_bytes_allowed.as_mut() {
        let before = *remaining;
        let received_bytes = event.response.encoded_data_length as u64;
        let declared_length: Option<u64> = event
            .response
            .headers
            .inner()
            .get("content-length")
            .and_then(|v| v.as_str())
            .and_then(|s| s.trim().parse::<u64>().ok());

        let mut after = before.saturating_sub(received_bytes);
        // The declared body would not fit into what is left: exhaust the budget.
        if matches!(declared_length, Some(cl) if cl > after) {
            after = 0;
        }
        *remaining = after;

        request_failed = after == 0;
        deducted = before.saturating_sub(after);
    }

    if deducted > 0 {
        self.queued_events
            .push_back(NetworkEvent::BytesConsumed(deducted));
    }
    if request_failed && self.max_bytes_allowed.is_some() {
        self.set_block_all(true);
    }

    if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
        request.set_response(event.response.clone());
        let outcome = if request_failed {
            NetworkEvent::RequestFailed(request)
        } else {
            NetworkEvent::RequestFinished(request)
        };
        self.queued_events.push_back(outcome);
    }
}
/// Finishes a tracked request: clears its auth-attempt record and queues
/// `RequestFinished`. Unknown request ids are ignored.
pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
    let request = match self.requests.remove(event.request_id.as_ref()) {
        Some(request) => request,
        None => return,
    };
    if let Some(interception_id) = &request.interception_id {
        self.attempted_authentications
            .remove(interception_id.as_ref());
    }
    self.queued_events
        .push_back(NetworkEvent::RequestFinished(request));
}
/// Fails a tracked request: stores the error text, clears its auth-attempt
/// record and queues `RequestFailed`. Unknown request ids are ignored.
pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
    let mut request = match self.requests.remove(event.request_id.as_ref()) {
        Some(request) => request,
        None => return,
    };
    request.failure_text = Some(event.error_text.clone());
    if let Some(interception_id) = &request.interception_id {
        self.attempted_authentications
            .remove(interception_id.as_ref());
    }
    self.queued_events
        .push_back(NetworkEvent::RequestFailed(request));
}
/// Starts tracking the request described by `event`.
///
/// If the event carries a redirect response and a request with the same id is already
/// tracked, that earlier request is treated as the previous hop: it receives the
/// redirect response, and it (together with its own chain) becomes the redirect chain
/// of the new request.
///
/// For redirect statuses whose `Location` header differs from the redirecting URL, the
/// redirecting URL is stripped out of the `Location` value; a non-empty result replaces
/// the header on the response stored for the previous hop.
///
/// A `NetworkEvent::Request` carrying the request id is queued afterwards.
fn on_request(
    &mut self,
    event: &EventRequestWillBeSent,
    interception_id: Option<InterceptionId>,
) {
    // Only look for a previous hop when the event actually reports a redirect.
    let previous_hop = match event.redirect_response.as_ref() {
        Some(redirect_resp) => self
            .requests
            .remove(event.request_id.as_ref())
            .map(|request| (redirect_resp, request)),
        None => None,
    };

    let redirect_chain = match previous_hop {
        None => Vec::new(),
        Some((redirect_resp, mut previous)) => {
            let mut rewritten_location: Option<String> = None;

            if is_redirect_status(redirect_resp.status) {
                if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
                    if redirect_resp.url != location {
                        let stripped = location.replace(&redirect_resp.url, "");
                        if !stripped.is_empty() {
                            if let Some(stored) = previous.response.as_mut() {
                                stored.headers.0["Location"] =
                                    serde_json::Value::String(stripped.clone());
                            }
                        }
                        rewritten_location = Some(stripped);
                    }
                }
            }

            // Hand the previous hop its own copy of the redirect response, with the
            // rewritten `Location` applied when there is one.
            let mut patched_resp = redirect_resp.clone();
            if let Some(location) = rewritten_location.filter(|l| !l.is_empty()) {
                patched_resp.headers.0["Location"] = serde_json::Value::String(location);
            }
            self.handle_request_redirect(&mut previous, patched_resp);

            // The new request inherits the earlier hops, followed by the previous one.
            let mut chain = std::mem::take(&mut previous.redirect_chain);
            chain.push(previous);
            chain
        }
    };

    let tracked = HttpRequest::new(
        event.request_id.clone(),
        event.frame_id.clone(),
        interception_id,
        self.user_request_interception_enabled,
        redirect_chain,
    );
    self.requests.insert(event.request_id.clone(), tracked);
    self.queued_events
        .push_back(NetworkEvent::Request(event.request_id.clone()));
}
/// Finalizes a request that was answered with a redirect.
///
/// Stores `response` on `request` and forgets any authentication attempt recorded for
/// the request's interception id.
fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
    request.set_response(response);

    if let Some(id) = &request.interception_id {
        self.attempted_authentications.remove(id.as_ref());
    }
}
}
/// Events queued by the network manager (in `queued_events`) for its owner to consume.
#[derive(Debug)]
pub enum NetworkEvent {
/// A CDP command to send: the method identifier together with a JSON value
/// (presumably the serialized command parameters — confirm against the producer).
SendCdpRequest((MethodId, serde_json::Value)),
/// A request with this id started being tracked (queued by `on_request`).
Request(RequestId),
/// NOTE(review): not produced in this part of the file; presumably signals that a
/// response arrived for the given request id — confirm against the producer.
Response(RequestId),
/// The request failed: queued when loading failed, or when a response arrived after
/// the byte budget was exhausted.
RequestFailed(HttpRequest),
/// The request completed: queued when its response was received or loading finished.
RequestFinished(HttpRequest),
/// Number of bytes a received response deducted from the byte budget.
BytesConsumed(u64),
}
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    #[cfg(feature = "adblock")]
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
    use std::time::Duration;

    /// The third-party allow-list must match challenge/payment/captcha/library scripts
    /// while leaving analytics beacons unmatched.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// A script URL that matches no blocklist entry is allowed.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    /// A script URL that matches the ignore trie / blocklist is blocked.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    /// Runtime blacklist patterns match URLs containing them and nothing else.
    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());
        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));
        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    /// With strict mode on, a URL on both lists is reported by both matchers and the
    /// strict flag is set.
    ///
    /// NOTE(review): this only checks the flags; the actual precedence is exercised by
    /// `test_e2e_blacklist_strict_overrides_whitelist` (requires the `adblock` feature).
    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);
        nm.set_blacklist_strict(true);
        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));
        assert!(nm.blacklist_strict);
    }

    /// With strict mode off, a URL on both lists is still reported by both matchers and
    /// the strict flag is cleared.
    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);
        nm.set_blacklist_strict(false);
        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }

    /// Builds a minimal `Fetch.requestPaused` event (GET, no headers) for `url`.
    #[cfg(feature = "adblock")]
    fn make_request_paused(
        url: &str,
        resource_type: ResourceType,
        is_same_site: bool,
    ) -> chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused {
        use chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused;
        use chromiumoxide_cdp::cdp::browser_protocol::network::{
            Headers, Request, RequestReferrerPolicy, ResourcePriority,
        };
        EventRequestPaused {
            request_id: chromiumoxide_cdp::cdp::browser_protocol::network::RequestId::from(
                "test-req".to_string(),
            )
            .into(),
            request: Request {
                url: url.to_string(),
                method: "GET".to_string(),
                headers: Headers::new(serde_json::Value::Object(Default::default())),
                initial_priority: ResourcePriority::Medium,
                referrer_policy: RequestReferrerPolicy::NoReferrer,
                url_fragment: None,
                has_post_data: None,
                post_data_entries: None,
                mixed_content_type: None,
                is_link_preload: None,
                trust_token_params: None,
                is_same_site: Some(is_same_site),
                is_ad_related: None,
            },
            frame_id: chromiumoxide_cdp::cdp::browser_protocol::page::FrameId::from(
                "frame1".to_string(),
            ),
            resource_type,
            response_error_reason: None,
            response_status_code: None,
            response_status_text: None,
            response_headers: None,
            network_id: None,
            redirected_request_id: None,
        }
    }

    /// Installs a custom adblock engine on `nm` built from the single filter `rule`.
    #[cfg(feature = "adblock")]
    fn install_custom_engine(nm: &mut NetworkManager, rule: &str) {
        let mut filter_set = adblock::lists::FilterSet::new(false);
        let mut opts = adblock::lists::ParseOptions::default();
        opts.rule_types = adblock::lists::RuleTypes::All;
        filter_set.add_filters([rule], opts);
        let engine = adblock::Engine::from_filter_set(filter_set, true);
        nm.set_adblock_engine(std::sync::Arc::new(engine));
    }

    /// Feeds one paused request through `on_fetch_request_paused` and reports whether
    /// the manager answered with `Fetch.fulfillRequest` or `Fetch.failRequest`.
    #[cfg(feature = "adblock")]
    fn run_full_interception(
        nm: &mut NetworkManager,
        url: &str,
        resource_type: ResourceType,
        is_same_site: bool,
    ) -> bool {
        use super::NetworkEvent;
        // Drop events left over from earlier calls so only this request is observed.
        while nm.poll().is_some() {}
        let event = make_request_paused(url, resource_type, is_same_site);
        nm.on_fetch_request_paused(&event);
        let mut blocked = false;
        while let Some(ev) = nm.poll() {
            if let NetworkEvent::SendCdpRequest((method, _)) = &ev {
                let m: &str = method.as_ref();
                if m == "Fetch.fulfillRequest" || m == "Fetch.failRequest" {
                    blocked = true;
                }
            }
        }
        blocked
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_detect_ad_blocks_known_tracker_scripts() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.wine-searcher.com/".to_string());
        let event = make_request_paused(
            "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
            ResourceType::Script,
            false,
        );
        assert!(
            nm.detect_ad(&event),
            "googletagmanager.com script should be detected as ad"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_detect_ad_allows_legitimate_scripts() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
        let event = make_request_paused(
            "https://www.mylegitsite-test.com/static/js/app-bundle.js",
            ResourceType::Script,
            true,
        );
        assert!(
            !nm.detect_ad(&event),
            "legitimate first-party app bundle should not be blocked"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_detect_ad_uses_source_domain() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.wine-searcher.com/some-page".to_string());
        assert!(
            !nm.document_target_domain.is_empty(),
            "document_target_domain should be set after set_page_url"
        );
        let event = make_request_paused(
            "https://www.google-analytics.com/analytics.js",
            ResourceType::Script,
            false,
        );
        assert!(
            nm.detect_ad(&event),
            "google-analytics.com should be blocked as tracker"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_custom_adblock_engine_takes_precedence() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());
        install_custom_engine(&mut nm, "||custom-tracker.example.net^");
        let event = make_request_paused(
            "https://custom-tracker.example.net/pixel.js",
            ResourceType::Script,
            false,
        );
        assert!(
            nm.detect_ad(&event),
            "custom engine rule should block custom-tracker.example.net"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_tracker_script_blocked() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.wine-searcher.com/".to_string());
        assert!(
            run_full_interception(
                &mut nm,
                "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
                ResourceType::Script,
                false,
            ),
            "GTM script should be blocked through full pipeline"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_legitimate_script_allowed() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
        assert!(
            !run_full_interception(
                &mut nm,
                "https://www.mylegitsite-test.com/static/js/app-bundle.js",
                ResourceType::Script,
                true,
            ),
            "legitimate first-party script should be allowed through full pipeline"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_analytics_xhr_blocked() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.org/".to_string());
        assert!(
            run_full_interception(
                &mut nm,
                "https://www.google-analytics.com/g/collect?v=2&tid=UA-123",
                ResourceType::Xhr,
                false,
            ),
            "Google Analytics XHR should be blocked through full pipeline"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_whitelisted_overrides_adblock() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.org/".to_string());
        nm.set_whitelist_patterns(["googletagmanager.com"]);
        assert!(
            !run_full_interception(
                &mut nm,
                "https://www.googletagmanager.com/gtm.js?id=GTM-TEST",
                ResourceType::Script,
                false,
            ),
            "whitelisted tracker should be allowed even when adblock would block it"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_blacklist_strict_overrides_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.org/".to_string());
        nm.set_blacklist_patterns(["cdn.example.net/evil.js"]);
        nm.set_whitelist_patterns(["cdn.example.net/evil.js"]);
        nm.set_blacklist_strict(true);
        assert!(
            run_full_interception(
                &mut nm,
                "https://cdn.example.net/evil.js",
                ResourceType::Script,
                false,
            ),
            "strict blacklist should win over whitelist"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_first_party_document_not_blocked() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.nytimes.com/".to_string());
        assert!(
            !run_full_interception(
                &mut nm,
                "https://www.nytimes.com/2024/article.html",
                ResourceType::Document,
                true,
            ),
            "first-party document navigation should never be blocked"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_custom_engine_blocks_through_pipeline() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://mysite.com/".to_string());
        install_custom_engine(&mut nm, "||evil-cdn.example.net^$script");
        assert!(
            run_full_interception(
                &mut nm,
                "https://evil-cdn.example.net/tracker.js",
                ResourceType::Script,
                false,
            ),
            "custom engine rule should block through full pipeline"
        );
        assert!(
            !run_full_interception(
                &mut nm,
                "https://mysite.com/app.js",
                ResourceType::Script,
                true,
            ),
            "first-party script should still be allowed with custom engine"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_ad_image_blocked() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
        assert!(
            run_full_interception(
                &mut nm,
                "https://googleads.g.doubleclick.net/pagead/viewthroughconversion/123/?random=456",
                ResourceType::Image,
                false,
            ),
            "doubleclick ad image/tracking pixel should be blocked"
        );
        assert!(
            !run_full_interception(
                &mut nm,
                "https://www.mylegitsite-test.com/images/logo.png",
                ResourceType::Image,
                true,
            ),
            "legitimate first-party image should not be blocked"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_hostname_with_userinfo() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.org/".to_string());
        assert!(
            run_full_interception(
                &mut nm,
                "https://user:pass@www.googletagmanager.com/gtm.js?id=GTM-XXXX",
                ResourceType::Script,
                false,
            ),
            "tracker URL with userinfo should still be blocked"
        );
    }
}