Skip to main content

stygian_browser/
page.rs

1//! Page and browsing context management for isolated, parallel scraping
2//!
3//! Each `BrowserContext` (future) is an incognito-style isolation boundary (separate
4//! cookies, localStorage, cache).  Each context can contain many [`PageHandle`]s
5//! (tabs).  Both types clean up their CDP resources automatically on drop.
6//!
7//! ## Resource blocking
8//!
9//! Pass a [`ResourceFilter`] to [`PageHandle::set_resource_filter`] to intercept
10//! and block specific request types (images, fonts, CSS) before page load —
11//! significantly reducing page load times for text-only scraping.
12//!
13//! ## Wait strategies
14//!
15//! [`PageHandle`] exposes three wait strategies via [`WaitUntil`]:
16//! - `DomContentLoaded` — fires when the HTML is parsed
//! - `NetworkIdle` — intended to fire when there are ≤2 in-flight requests for 500 ms (currently approximated by the page `load` event)
18//! - `Selector(css)` — fires when a CSS selector matches an element
19//!
20//! # Example
21//!
22//! ```no_run
23//! use stygian_browser::{BrowserPool, BrowserConfig};
24//! use stygian_browser::page::{ResourceFilter, WaitUntil};
25//! use std::time::Duration;
26//!
27//! # async fn run() -> stygian_browser::error::Result<()> {
28//! let pool = BrowserPool::new(BrowserConfig::default()).await?;
29//! let handle = pool.acquire().await?;
30//!
31//! let mut page = handle.browser().expect("valid browser").new_page().await?;
32//! page.set_resource_filter(ResourceFilter::block_media()).await?;
33//! page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
34//! let title = page.title().await?;
35//! println!("title: {title}");
36//! handle.release().await;
37//! # Ok(())
38//! # }
39//! ```
40
41use std::time::Duration;
42
43use chromiumoxide::Page;
44use tokio::time::timeout;
45use tracing::{debug, warn};
46
47use crate::error::{BrowserError, Result};
48
49// ─── ResourceType ─────────────────────────────────────────────────────────────
50
/// Categories of network resources that a resource filter can block.
///
/// Every variant maps to the CDP resource-type name returned by
/// [`ResourceType::as_cdp_str`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ResourceType {
    /// Images: `<img>`, `<picture>`, CSS background images
    Image,
    /// Web fonts pulled in through CSS `@font-face`
    Font,
    /// External CSS stylesheets
    Stylesheet,
    /// Audio and video media files
    Media,
}

impl ResourceType {
    /// Returns the CDP resource-type name for this variant (e.g. `"Image"`).
    ///
    /// The returned string is a `'static` literal; no allocation happens.
    pub const fn as_cdp_str(&self) -> &'static str {
        match *self {
            Self::Media => "Media",
            Self::Stylesheet => "Stylesheet",
            Self::Font => "Font",
            Self::Image => "Image",
        }
    }
}
75
76// ─── ResourceFilter ───────────────────────────────────────────────────────────
77
78/// Set of resource types to block from loading.
79///
80/// # Example
81///
82/// ```
83/// use stygian_browser::page::ResourceFilter;
84/// let filter = ResourceFilter::block_media();
85/// assert!(filter.should_block("Image"));
86/// ```
87#[derive(Debug, Clone, Default)]
88pub struct ResourceFilter {
89    blocked: Vec<ResourceType>,
90}
91
92impl ResourceFilter {
93    /// Block all media resources (images, fonts, CSS, audio/video).
94    pub fn block_media() -> Self {
95        Self {
96            blocked: vec![
97                ResourceType::Image,
98                ResourceType::Font,
99                ResourceType::Stylesheet,
100                ResourceType::Media,
101            ],
102        }
103    }
104
105    /// Block only images and fonts (keep styles for layout-sensitive work).
106    pub fn block_images_and_fonts() -> Self {
107        Self {
108            blocked: vec![ResourceType::Image, ResourceType::Font],
109        }
110    }
111
112    /// Add a resource type to the block list.
113    #[must_use]
114    pub fn block(mut self, resource: ResourceType) -> Self {
115        if !self.blocked.contains(&resource) {
116            self.blocked.push(resource);
117        }
118        self
119    }
120
121    /// Returns `true` if the given CDP resource type string should be blocked.
122    pub fn should_block(&self, cdp_type: &str) -> bool {
123        self.blocked
124            .iter()
125            .any(|r| r.as_cdp_str().eq_ignore_ascii_case(cdp_type))
126    }
127
128    /// Returns `true` if no resource types are blocked.
129    pub const fn is_empty(&self) -> bool {
130        self.blocked.is_empty()
131    }
132}
133
134// ─── WaitUntil ────────────────────────────────────────────────────────────────
135
/// The condition a navigation waits for before it is considered complete.
///
/// # Example
///
/// ```
/// use stygian_browser::page::WaitUntil;
/// let w = WaitUntil::Selector("#main".to_string());
/// assert!(matches!(w, WaitUntil::Selector(_)));
/// ```
#[derive(Debug, Clone)]
pub enum WaitUntil {
    /// Wait for the document to finish loading (`DOMContentLoaded`).
    DomContentLoaded,
    /// Wait until there are ≤2 active network requests for at least 500 ms.
    ///
    /// NOTE(review): `PageHandle::navigate` currently handles this variant
    /// the same way as `DomContentLoaded`, using the page load event as a
    /// proxy rather than tracking in-flight requests.
    NetworkIdle,
    /// Wait until the given CSS selector matches an element in the page.
    Selector(String),
}
154
155// ─── PageHandle ───────────────────────────────────────────────────────────────
156
157/// A handle to an open browser tab.
158///
159/// On drop the underlying page is closed automatically.
160///
161/// # Example
162///
163/// ```no_run
164/// use stygian_browser::{BrowserPool, BrowserConfig};
165/// use stygian_browser::page::WaitUntil;
166/// use std::time::Duration;
167///
168/// # async fn run() -> stygian_browser::error::Result<()> {
169/// let pool = BrowserPool::new(BrowserConfig::default()).await?;
170/// let handle = pool.acquire().await?;
171/// let mut page = handle.browser().expect("valid browser").new_page().await?;
172/// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
173/// let html = page.content().await?;
174/// drop(page); // closes the tab
175/// handle.release().await;
176/// # Ok(())
177/// # }
178/// ```
179pub struct PageHandle {
180    page: Page,
181    cdp_timeout: Duration,
182}
183
184impl PageHandle {
185    /// Wrap a raw chromiumoxide [`Page`] in a handle.
186    pub(crate) const fn new(page: Page, cdp_timeout: Duration) -> Self {
187        Self { page, cdp_timeout }
188    }
189
190    /// Navigate to `url` and wait for `condition` within `nav_timeout`.
191    ///
192    /// # Errors
193    ///
194    /// Returns [`BrowserError::NavigationFailed`] if the navigation times out or
195    /// the CDP call fails.
196    pub async fn navigate(
197        &mut self,
198        url: &str,
199        condition: WaitUntil,
200        nav_timeout: Duration,
201    ) -> Result<()> {
202        use chromiumoxide::cdp::browser_protocol::page::EventLoadEventFired;
203        use futures::StreamExt;
204
205        let url_owned = url.to_string();
206
207        let navigate_fut = async {
208            self.page
209                .goto(url)
210                .await
211                .map_err(|e| BrowserError::NavigationFailed {
212                    url: url_owned.clone(),
213                    reason: e.to_string(),
214                })?;
215
216            match &condition {
217                WaitUntil::DomContentLoaded | WaitUntil::NetworkIdle => {
218                    // chromiumoxide's goto() already waits for load; for
219                    // NetworkIdle we listen for the load event as a proxy
220                    // (full idle detection requires request interception which
221                    // is setup separately).
222                    let mut events = self
223                        .page
224                        .event_listener::<EventLoadEventFired>()
225                        .await
226                        .map_err(|e| BrowserError::NavigationFailed {
227                            url: url_owned.clone(),
228                            reason: e.to_string(),
229                        })?;
230                    // consume first event or treat as already fired
231                    let _ = events.next().await;
232                }
233                WaitUntil::Selector(css) => {
234                    self.wait_for_selector(css, nav_timeout).await?;
235                }
236            }
237            Ok(())
238        };
239
240        timeout(nav_timeout, navigate_fut)
241            .await
242            .map_err(|_| BrowserError::NavigationFailed {
243                url: url.to_string(),
244                reason: format!("navigation timed out after {nav_timeout:?}"),
245            })?
246    }
247
248    /// Wait until `document.querySelector(selector)` is non-null (`timeout`).
249    ///
250    /// # Errors
251    ///
252    /// Returns [`BrowserError::NavigationFailed`] if the selector is not found
253    /// within the given timeout.
254    pub async fn wait_for_selector(&self, selector: &str, wait_timeout: Duration) -> Result<()> {
255        let selector_owned = selector.to_string();
256        let poll = async {
257            loop {
258                if self.page.find_element(selector_owned.clone()).await.is_ok() {
259                    return Ok(());
260                }
261                tokio::time::sleep(Duration::from_millis(100)).await;
262            }
263        };
264
265        timeout(wait_timeout, poll)
266            .await
267            .map_err(|_| BrowserError::NavigationFailed {
268                url: String::new(),
269                reason: format!("selector '{selector_owned}' not found within {wait_timeout:?}"),
270            })?
271    }
272
273    /// Set a resource filter to block specific network request types.
274    ///
275    /// **Note:** Requires Network.enable; called automatically.
276    ///
277    /// # Errors
278    ///
279    /// Returns a [`BrowserError::CdpError`] if the CDP call fails.
280    pub async fn set_resource_filter(&mut self, filter: ResourceFilter) -> Result<()> {
281        use chromiumoxide::cdp::browser_protocol::fetch::{EnableParams, RequestPattern};
282
283        if filter.is_empty() {
284            return Ok(());
285        }
286
287        // Both builders are infallible — they return the struct directly (not Result)
288        let pattern = RequestPattern::builder().url_pattern("*").build();
289        let params = EnableParams::builder()
290            .patterns(vec![pattern])
291            .handle_auth_requests(false)
292            .build();
293
294        timeout(self.cdp_timeout, self.page.execute::<EnableParams>(params))
295            .await
296            .map_err(|_| BrowserError::Timeout {
297                operation: "Fetch.enable".to_string(),
298                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
299            })?
300            .map_err(|e| BrowserError::CdpError {
301                operation: "Fetch.enable".to_string(),
302                message: e.to_string(),
303            })?;
304
305        debug!("Resource filter active: {:?}", filter);
306        Ok(())
307    }
308
309    /// Return the page's `<title>` text.
310    ///
311    /// # Errors
312    ///
313    /// Returns [`BrowserError::ScriptExecutionFailed`] if the evaluation fails.
314    pub async fn title(&self) -> Result<String> {
315        timeout(self.cdp_timeout, self.page.get_title())
316            .await
317            .map_err(|_| BrowserError::Timeout {
318                operation: "get_title".to_string(),
319                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
320            })?
321            .map_err(|e| BrowserError::ScriptExecutionFailed {
322                script: "document.title".to_string(),
323                reason: e.to_string(),
324            })
325            .map(Option::unwrap_or_default)
326    }
327
328    /// Return the page's full outer HTML.
329    ///
330    /// # Errors
331    ///
332    /// Returns [`BrowserError::ScriptExecutionFailed`] if the evaluation fails.
333    pub async fn content(&self) -> Result<String> {
334        timeout(self.cdp_timeout, self.page.content())
335            .await
336            .map_err(|_| BrowserError::Timeout {
337                operation: "page.content".to_string(),
338                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
339            })?
340            .map_err(|e| BrowserError::ScriptExecutionFailed {
341                script: "document.documentElement.outerHTML".to_string(),
342                reason: e.to_string(),
343            })
344    }
345
346    /// Evaluate arbitrary JavaScript and return the result as `T`.
347    ///
348    /// # Errors
349    ///
350    /// Returns [`BrowserError::ScriptExecutionFailed`] on eval failure or
351    /// deserialization error.
352    pub async fn eval<T: serde::de::DeserializeOwned>(&self, script: &str) -> Result<T> {
353        let script_owned = script.to_string();
354        timeout(self.cdp_timeout, self.page.evaluate(script))
355            .await
356            .map_err(|_| BrowserError::Timeout {
357                operation: "page.evaluate".to_string(),
358                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
359            })?
360            .map_err(|e| BrowserError::ScriptExecutionFailed {
361                script: script_owned.clone(),
362                reason: e.to_string(),
363            })?
364            .into_value::<T>()
365            .map_err(|e| BrowserError::ScriptExecutionFailed {
366                script: script_owned,
367                reason: e.to_string(),
368            })
369    }
370
371    /// Save all cookies for the current page's origin.
372    ///
373    /// # Errors
374    ///
375    /// Returns [`BrowserError::CdpError`] if the CDP call fails.
376    pub async fn save_cookies(
377        &self,
378    ) -> Result<Vec<chromiumoxide::cdp::browser_protocol::network::Cookie>> {
379        use chromiumoxide::cdp::browser_protocol::network::GetCookiesParams;
380
381        let url = self
382            .page
383            .url()
384            .await
385            .map_err(|e| BrowserError::CdpError {
386                operation: "page.url".to_string(),
387                message: e.to_string(),
388            })?
389            .unwrap_or_default();
390
391        timeout(
392            self.cdp_timeout,
393            self.page
394                .execute(GetCookiesParams::builder().urls(vec![url]).build()),
395        )
396        .await
397        .map_err(|_| BrowserError::Timeout {
398            operation: "Network.getCookies".to_string(),
399            duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
400        })?
401        .map_err(|e| BrowserError::CdpError {
402            operation: "Network.getCookies".to_string(),
403            message: e.to_string(),
404        })
405        .map(|r| r.cookies.clone())
406    }
407
408    /// Capture a screenshot of the current page as PNG bytes.
409    ///
410    /// The screenshot is full-page by default (viewport clipped to the rendered
411    /// layout area).  Save the returned bytes to a `.png` file or process
412    /// them in-memory.
413    ///
414    /// # Errors
415    ///
416    /// Returns [`BrowserError::CdpError`] if the CDP `Page.captureScreenshot`
417    /// command fails, or [`BrowserError::Timeout`] if it exceeds
418    /// `cdp_timeout`.
419    ///
420    /// # Example
421    ///
422    /// ```no_run
423    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
424    /// use std::{time::Duration, fs};
425    ///
426    /// # async fn run() -> stygian_browser::error::Result<()> {
427    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
428    /// let handle = pool.acquire().await?;
429    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
430    /// page.navigate("https://example.com", WaitUntil::Selector("body".to_string()), Duration::from_secs(30)).await?;
431    /// let png = page.screenshot().await?;
432    /// fs::write("screenshot.png", &png).unwrap();
433    /// # Ok(())
434    /// # }
435    /// ```
436    pub async fn screenshot(&self) -> Result<Vec<u8>> {
437        use chromiumoxide::page::ScreenshotParams;
438
439        let params = ScreenshotParams::builder().full_page(true).build();
440
441        timeout(self.cdp_timeout, self.page.screenshot(params))
442            .await
443            .map_err(|_| BrowserError::Timeout {
444                operation: "Page.captureScreenshot".to_string(),
445                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
446            })?
447            .map_err(|e| BrowserError::CdpError {
448                operation: "Page.captureScreenshot".to_string(),
449                message: e.to_string(),
450            })
451    }
452
453    /// Borrow the underlying chromiumoxide [`Page`].
454    pub const fn inner(&self) -> &Page {
455        &self.page
456    }
457
458    /// Close this page (tab).
459    ///
460    /// Called automatically on drop; explicit call avoids suppressing the error.
461    pub async fn close(self) -> Result<()> {
462        timeout(Duration::from_secs(5), self.page.clone().close())
463            .await
464            .map_err(|_| BrowserError::Timeout {
465                operation: "page.close".to_string(),
466                duration_ms: 5000,
467            })?
468            .map_err(|e| BrowserError::CdpError {
469                operation: "page.close".to_string(),
470                message: e.to_string(),
471            })
472    }
473}
474
475impl Drop for PageHandle {
476    fn drop(&mut self) {
477        warn!("PageHandle dropped without explicit close(); spawning cleanup task");
478        // chromiumoxide Page does not implement close on Drop, so we spawn
479        // a fire-and-forget task. The page ref is already owned; we need to
480        // swap it out. We clone the Page handle (it's Arc-backed internally).
481        let page = self.page.clone();
482        tokio::spawn(async move {
483            let _ = page.close().await;
484        });
485    }
486}
487
488// ─── Tests ────────────────────────────────────────────────────────────────────
489
#[cfg(test)]
mod tests {
    use super::*;

    /// The `block_media` preset blocks all four known types and nothing else.
    #[test]
    fn resource_filter_block_media_blocks_image() {
        let filter = ResourceFilter::block_media();
        for blocked in &["Image", "Font", "Stylesheet", "Media"] {
            assert!(filter.should_block(blocked));
        }
        for allowed in &["Script", "XHR"] {
            assert!(!filter.should_block(allowed));
        }
    }

    /// Resource-type matching ignores ASCII case.
    #[test]
    fn resource_filter_case_insensitive() {
        let filter = ResourceFilter::block_images_and_fonts();
        let lowercase = filter.should_block("image");
        let uppercase = filter.should_block("IMAGE");
        assert!(lowercase);
        assert!(uppercase);
        assert!(!filter.should_block("Stylesheet"));
    }

    /// Chained `block` calls accumulate exactly the requested types.
    #[test]
    fn resource_filter_builder_chain() {
        let filter = ResourceFilter::default()
            .block(ResourceType::Image)
            .block(ResourceType::Font);
        assert!(filter.should_block("Image"));
        assert!(filter.should_block("Font"));
        assert!(!filter.should_block("Stylesheet"));
    }

    /// Blocking the same type twice stores it only once.
    #[test]
    fn resource_filter_dedup_block() {
        let once = ResourceFilter::default().block(ResourceType::Image);
        let twice = once.block(ResourceType::Image); // duplicate
        assert_eq!(twice.blocked.len(), 1);
    }

    /// The default filter is empty; presets are not.
    #[test]
    fn resource_filter_is_empty_when_default() {
        let empty = ResourceFilter::default();
        let preset = ResourceFilter::block_media();
        assert!(empty.is_empty());
        assert!(!preset.is_empty());
    }

    /// `Selector` keeps the CSS string it was built with.
    #[test]
    fn wait_until_selector_stores_string() {
        let w = WaitUntil::Selector("#foo".to_string());
        assert!(matches!(w, WaitUntil::Selector(ref s) if s == "#foo"));
    }

    /// Every variant maps to its CDP resource-type name.
    #[test]
    fn resource_type_cdp_str() {
        let cases = [
            (ResourceType::Image, "Image"),
            (ResourceType::Font, "Font"),
            (ResourceType::Stylesheet, "Stylesheet"),
            (ResourceType::Media, "Media"),
        ];
        for (kind, name) in &cases {
            assert_eq!(kind.as_cdp_str(), *name);
        }
    }
}