Skip to main content

scrapling_browser/
intercept.rs

//! Request interception logic for blocking unwanted network requests.
//!
//! When a browser page has resource blocking or domain blocking enabled, every
//! outgoing request is evaluated by the functions in this module before it is
//! allowed to proceed. There are two independent blocking mechanisms:
//!
//! 1. **Resource-type blocking** ([`should_block_resource`]) -- drops requests for
//!    heavyweight resource types like images, fonts, stylesheets, and media when
//!    `disable_resources` is `true`. The exact list of blocked types is defined in
//!    [`crate::constants::EXTRA_RESOURCES`].
//!
//! 2. **Domain blocking** ([`is_domain_blocked`]) -- drops requests whose hostname
//!    (or any parent domain suffix) appears in a blocklist. This powers both custom
//!    domain blocking and the built-in ad/tracker blocklist.
//!
//! The top-level [`should_block_request`] function combines both checks and is the
//! one wired into Playwright's route handler in [`crate::fetcher`].
18
19use std::collections::HashSet;
20
21use crate::constants::EXTRA_RESOURCES;
22
23/// Returns `true` if the given resource type should be blocked when resource blocking is enabled.
24///
25/// The check is skipped entirely when `disable_resources` is `false`. When enabled,
26/// it matches `resource_type` against the list in [`constants::EXTRA_RESOURCES`],
27/// which includes types like `"font"`, `"image"`, `"stylesheet"`, and `"media"`.
28pub fn should_block_resource(resource_type: &str, disable_resources: bool) -> bool {
29    if !disable_resources {
30        return false;
31    }
32    EXTRA_RESOURCES.contains(&resource_type)
33}
34
/// Returns `true` if `hostname` (or any of its parent domains) appears in `blocked_domains`.
///
/// The check walks up the domain suffix chain, so blocking `"doubleclick.net"` will
/// also block `"ad.doubleclick.net"` and `"sub.ad.doubleclick.net"`. Only whole
/// labels are compared, so `"notdoubleclick.net"` is *not* matched by that entry.
///
/// A fully-qualified hostname written with a trailing root dot (for example
/// `"ad.doubleclick.net."`) names the same DNS host as the dot-less form, so each
/// candidate is also looked up with its trailing dot removed. Without this, a
/// request could slip past the blocklist simply by appending `.` to the host.
///
/// Matching is exact and case-sensitive; callers are expected to pass a
/// lower-cased hostname if the blocklist is stored in lower case.
///
/// Returns `false` immediately when `blocked_domains` is empty, avoiding
/// unnecessary work.
pub fn is_domain_blocked(hostname: &str, blocked_domains: &HashSet<String>) -> bool {
    if blocked_domains.is_empty() {
        return false;
    }

    // A candidate matches either verbatim or with a trailing root dot removed.
    let is_listed = |candidate: &str| {
        if blocked_domains.contains(candidate) {
            return true;
        }
        match candidate.strip_suffix('.') {
            Some(bare) => blocked_domains.contains(bare),
            None => false,
        }
    };

    // Candidates are the hostname itself followed by every parent suffix:
    // "sub.ads.example.com" -> "ads.example.com" -> "example.com" -> "com".
    std::iter::successors(Some(hostname), |&current| {
        current.split_once('.').map(|(_, parent)| parent)
    })
    .any(is_listed)
}
61
62/// Returns `true` if a request should be blocked based on its resource type or domain.
63///
64/// This is the top-level function called from Playwright's route handler. It first
65/// checks resource-type blocking, then parses the URL and checks domain blocking.
66/// If either check matches, the request is blocked.
67pub fn should_block_request(
68    resource_type: &str,
69    url: &str,
70    disable_resources: bool,
71    blocked_domains: &HashSet<String>,
72) -> bool {
73    if should_block_resource(resource_type, disable_resources) {
74        return true;
75    }
76
77    if !blocked_domains.is_empty() {
78        if let Ok(parsed) = url::Url::parse(url) {
79            if let Some(host) = parsed.host_str() {
80                return is_domain_blocked(&host.to_lowercase(), blocked_domains);
81            }
82        }
83    }
84
85    false
86}
87
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an owned blocklist from string literals.
    fn blocklist(entries: &[&str]) -> HashSet<String> {
        entries.iter().map(|entry| (*entry).to_owned()).collect()
    }

    #[test]
    fn resource_blocking() {
        // Listed types are blocked only while resource blocking is switched on.
        for &kind in &["font", "image", "stylesheet"] {
            assert!(should_block_resource(kind, true));
        }
        assert!(!should_block_resource("document", true));
        assert!(!should_block_resource("font", false));
    }

    #[test]
    fn domain_blocking_exact() {
        let blocked = blocklist(&["ads.example.com"]);
        assert!(is_domain_blocked("ads.example.com", &blocked));
        // Blocking a subdomain must not block its parent.
        assert!(!is_domain_blocked("example.com", &blocked));
    }

    #[test]
    fn domain_blocking_suffix() {
        let blocked = blocklist(&["doubleclick.net"]);
        for &host in &[
            "ad.doubleclick.net",
            "sub.ad.doubleclick.net",
            "doubleclick.net",
        ] {
            assert!(is_domain_blocked(host, &blocked));
        }
        // Only whole labels count: a shared string suffix is not a parent domain.
        assert!(!is_domain_blocked("notdoubleclick.net", &blocked));
    }

    #[test]
    fn domain_blocking_empty() {
        let blocked = blocklist(&[]);
        assert!(!is_domain_blocked("anything.com", &blocked));
    }

    #[test]
    fn should_block_request_combined() {
        let blocked = blocklist(&["tracker.com"]);

        // Blocked by domain even though resource blocking is off.
        let by_domain =
            should_block_request("document", "https://tracker.com/pixel", false, &blocked);
        assert!(by_domain);

        // Blocked by resource type even though the domain is not listed.
        let by_type = should_block_request("font", "https://cdn.com/font.woff", true, &blocked);
        assert!(by_type);

        // Neither check matches.
        let neither = should_block_request("document", "https://example.com", false, &blocked);
        assert!(!neither);
    }
}