stygian_browser/page.rs
1//! Page and browsing context management for isolated, parallel scraping
2//!
3//! Each `BrowserContext` (future) is an incognito-style isolation boundary (separate
4//! cookies, localStorage, cache). Each context can contain many [`PageHandle`]s
5//! (tabs). Both types clean up their CDP resources automatically on drop.
6//!
7//! ## Resource blocking
8//!
9//! Pass a [`ResourceFilter`] to [`PageHandle::set_resource_filter`] to intercept
10//! and block specific request types (images, fonts, CSS) before page load —
11//! significantly reducing page load times for text-only scraping.
12//!
13//! ## Wait strategies
14//!
15//! [`PageHandle`] exposes three wait strategies via [`WaitUntil`]:
16//! - `DomContentLoaded` — fires when the HTML is parsed
//! - `NetworkIdle` — intended to fire when there are ≤2 in-flight requests for 500 ms
//!   (currently approximated by waiting for the page `load` event)
18//! - `Selector(css)` — fires when a CSS selector matches an element
19//!
20//! # Example
21//!
22//! ```no_run
23//! use stygian_browser::{BrowserPool, BrowserConfig};
24//! use stygian_browser::page::{ResourceFilter, WaitUntil};
25//! use std::time::Duration;
26//!
27//! # async fn run() -> stygian_browser::error::Result<()> {
28//! let pool = BrowserPool::new(BrowserConfig::default()).await?;
29//! let handle = pool.acquire().await?;
30//!
31//! let mut page = handle.browser().expect("valid browser").new_page().await?;
32//! page.set_resource_filter(ResourceFilter::block_media()).await?;
33//! page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
34//! let title = page.title().await?;
35//! println!("title: {title}");
36//! handle.release().await;
37//! # Ok(())
38//! # }
39//! ```
40
41use std::time::Duration;
42
43use chromiumoxide::Page;
44use tokio::time::timeout;
45use tracing::{debug, warn};
46
47use crate::error::{BrowserError, Result};
48
49// ─── ResourceType ─────────────────────────────────────────────────────────────
50
/// Categories of network resources that can be intercepted and blocked.
///
/// Each variant maps to a resource-type name via
/// [`ResourceType::as_cdp_str`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ResourceType {
    /// Images: `<img>`, `<picture>`, and background images.
    Image,
    /// Web fonts pulled in through CSS `@font-face`.
    Font,
    /// External CSS stylesheets.
    Stylesheet,
    /// Audio and video media files.
    Media,
}

impl ResourceType {
    /// Name of this resource type as it appears in CDP
    /// `Network.requestIntercepted` events.
    pub const fn as_cdp_str(&self) -> &'static str {
        match *self {
            Self::Media => "Media",
            Self::Stylesheet => "Stylesheet",
            Self::Font => "Font",
            Self::Image => "Image",
        }
    }
}
75
76// ─── ResourceFilter ───────────────────────────────────────────────────────────
77
78/// Set of resource types to block from loading.
79///
80/// # Example
81///
82/// ```
83/// use stygian_browser::page::ResourceFilter;
84/// let filter = ResourceFilter::block_media();
85/// assert!(filter.should_block("Image"));
86/// ```
87#[derive(Debug, Clone, Default)]
88pub struct ResourceFilter {
89 blocked: Vec<ResourceType>,
90}
91
92impl ResourceFilter {
93 /// Block all media resources (images, fonts, CSS, audio/video).
94 pub fn block_media() -> Self {
95 Self {
96 blocked: vec![
97 ResourceType::Image,
98 ResourceType::Font,
99 ResourceType::Stylesheet,
100 ResourceType::Media,
101 ],
102 }
103 }
104
105 /// Block only images and fonts (keep styles for layout-sensitive work).
106 pub fn block_images_and_fonts() -> Self {
107 Self {
108 blocked: vec![ResourceType::Image, ResourceType::Font],
109 }
110 }
111
112 /// Add a resource type to the block list.
113 #[must_use]
114 pub fn block(mut self, resource: ResourceType) -> Self {
115 if !self.blocked.contains(&resource) {
116 self.blocked.push(resource);
117 }
118 self
119 }
120
121 /// Returns `true` if the given CDP resource type string should be blocked.
122 pub fn should_block(&self, cdp_type: &str) -> bool {
123 self.blocked
124 .iter()
125 .any(|r| r.as_cdp_str().eq_ignore_ascii_case(cdp_type))
126 }
127
128 /// Returns `true` if no resource types are blocked.
129 pub const fn is_empty(&self) -> bool {
130 self.blocked.is_empty()
131 }
132}
133
134// ─── WaitUntil ────────────────────────────────────────────────────────────────
135
/// What a navigation should wait for before it is considered complete.
///
/// # Example
///
/// ```
/// use stygian_browser::page::WaitUntil;
/// let w = WaitUntil::Selector("#main".to_string());
/// assert!(matches!(w, WaitUntil::Selector(_)));
/// ```
#[derive(Debug, Clone)]
pub enum WaitUntil {
    /// Complete once the `DOMContentLoaded` event has been observed.
    DomContentLoaded,
    /// Complete once there have been ≤2 active network requests for at least
    /// 500 ms.
    ///
    /// NOTE(review): `PageHandle::navigate` currently waits for the page load
    /// event as a stand-in for real idle detection.
    NetworkIdle,
    /// Complete once `document.querySelector(selector)` yields a non-null
    /// element; the payload is the CSS selector.
    Selector(String),
}
154
155// ─── PageHandle ───────────────────────────────────────────────────────────────
156
157/// A handle to an open browser tab.
158///
159/// On drop the underlying page is closed automatically.
160///
161/// # Example
162///
163/// ```no_run
164/// use stygian_browser::{BrowserPool, BrowserConfig};
165/// use stygian_browser::page::WaitUntil;
166/// use std::time::Duration;
167///
168/// # async fn run() -> stygian_browser::error::Result<()> {
169/// let pool = BrowserPool::new(BrowserConfig::default()).await?;
170/// let handle = pool.acquire().await?;
171/// let mut page = handle.browser().expect("valid browser").new_page().await?;
172/// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
173/// let html = page.content().await?;
174/// drop(page); // closes the tab
175/// handle.release().await;
176/// # Ok(())
177/// # }
178/// ```
179pub struct PageHandle {
180 page: Page,
181 cdp_timeout: Duration,
182}
183
184impl PageHandle {
185 /// Wrap a raw chromiumoxide [`Page`] in a handle.
186 pub(crate) const fn new(page: Page, cdp_timeout: Duration) -> Self {
187 Self { page, cdp_timeout }
188 }
189
190 /// Navigate to `url` and wait for `condition` within `nav_timeout`.
191 ///
192 /// # Errors
193 ///
194 /// Returns [`BrowserError::NavigationFailed`] if the navigation times out or
195 /// the CDP call fails.
196 pub async fn navigate(
197 &mut self,
198 url: &str,
199 condition: WaitUntil,
200 nav_timeout: Duration,
201 ) -> Result<()> {
202 use chromiumoxide::cdp::browser_protocol::page::EventLoadEventFired;
203 use futures::StreamExt;
204
205 let url_owned = url.to_string();
206
207 let navigate_fut = async {
208 self.page
209 .goto(url)
210 .await
211 .map_err(|e| BrowserError::NavigationFailed {
212 url: url_owned.clone(),
213 reason: e.to_string(),
214 })?;
215
216 match &condition {
217 WaitUntil::DomContentLoaded | WaitUntil::NetworkIdle => {
218 // chromiumoxide's goto() already waits for load; for
219 // NetworkIdle we listen for the load event as a proxy
220 // (full idle detection requires request interception which
221 // is setup separately).
222 let mut events = self
223 .page
224 .event_listener::<EventLoadEventFired>()
225 .await
226 .map_err(|e| BrowserError::NavigationFailed {
227 url: url_owned.clone(),
228 reason: e.to_string(),
229 })?;
230 // consume first event or treat as already fired
231 let _ = events.next().await;
232 }
233 WaitUntil::Selector(css) => {
234 self.wait_for_selector(css, nav_timeout).await?;
235 }
236 }
237 Ok(())
238 };
239
240 timeout(nav_timeout, navigate_fut)
241 .await
242 .map_err(|_| BrowserError::NavigationFailed {
243 url: url.to_string(),
244 reason: format!("navigation timed out after {nav_timeout:?}"),
245 })?
246 }
247
248 /// Wait until `document.querySelector(selector)` is non-null (`timeout`).
249 ///
250 /// # Errors
251 ///
252 /// Returns [`BrowserError::NavigationFailed`] if the selector is not found
253 /// within the given timeout.
254 pub async fn wait_for_selector(&self, selector: &str, wait_timeout: Duration) -> Result<()> {
255 let selector_owned = selector.to_string();
256 let poll = async {
257 loop {
258 if self.page.find_element(selector_owned.clone()).await.is_ok() {
259 return Ok(());
260 }
261 tokio::time::sleep(Duration::from_millis(100)).await;
262 }
263 };
264
265 timeout(wait_timeout, poll)
266 .await
267 .map_err(|_| BrowserError::NavigationFailed {
268 url: String::new(),
269 reason: format!("selector '{selector_owned}' not found within {wait_timeout:?}"),
270 })?
271 }
272
273 /// Set a resource filter to block specific network request types.
274 ///
275 /// **Note:** Requires Network.enable; called automatically.
276 ///
277 /// # Errors
278 ///
279 /// Returns a [`BrowserError::CdpError`] if the CDP call fails.
280 pub async fn set_resource_filter(&mut self, filter: ResourceFilter) -> Result<()> {
281 use chromiumoxide::cdp::browser_protocol::fetch::{EnableParams, RequestPattern};
282
283 if filter.is_empty() {
284 return Ok(());
285 }
286
287 // Both builders are infallible — they return the struct directly (not Result)
288 let pattern = RequestPattern::builder().url_pattern("*").build();
289 let params = EnableParams::builder()
290 .patterns(vec![pattern])
291 .handle_auth_requests(false)
292 .build();
293
294 timeout(self.cdp_timeout, self.page.execute::<EnableParams>(params))
295 .await
296 .map_err(|_| BrowserError::Timeout {
297 operation: "Fetch.enable".to_string(),
298 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
299 })?
300 .map_err(|e| BrowserError::CdpError {
301 operation: "Fetch.enable".to_string(),
302 message: e.to_string(),
303 })?;
304
305 debug!("Resource filter active: {:?}", filter);
306 Ok(())
307 }
308
309 /// Return the page's `<title>` text.
310 ///
311 /// # Errors
312 ///
313 /// Returns [`BrowserError::ScriptExecutionFailed`] if the evaluation fails.
314 pub async fn title(&self) -> Result<String> {
315 timeout(self.cdp_timeout, self.page.get_title())
316 .await
317 .map_err(|_| BrowserError::Timeout {
318 operation: "get_title".to_string(),
319 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
320 })?
321 .map_err(|e| BrowserError::ScriptExecutionFailed {
322 script: "document.title".to_string(),
323 reason: e.to_string(),
324 })
325 .map(Option::unwrap_or_default)
326 }
327
328 /// Return the page's full outer HTML.
329 ///
330 /// # Errors
331 ///
332 /// Returns [`BrowserError::ScriptExecutionFailed`] if the evaluation fails.
333 pub async fn content(&self) -> Result<String> {
334 timeout(self.cdp_timeout, self.page.content())
335 .await
336 .map_err(|_| BrowserError::Timeout {
337 operation: "page.content".to_string(),
338 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
339 })?
340 .map_err(|e| BrowserError::ScriptExecutionFailed {
341 script: "document.documentElement.outerHTML".to_string(),
342 reason: e.to_string(),
343 })
344 }
345
346 /// Evaluate arbitrary JavaScript and return the result as `T`.
347 ///
348 /// # Errors
349 ///
350 /// Returns [`BrowserError::ScriptExecutionFailed`] on eval failure or
351 /// deserialization error.
352 pub async fn eval<T: serde::de::DeserializeOwned>(&self, script: &str) -> Result<T> {
353 let script_owned = script.to_string();
354 timeout(self.cdp_timeout, self.page.evaluate(script))
355 .await
356 .map_err(|_| BrowserError::Timeout {
357 operation: "page.evaluate".to_string(),
358 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
359 })?
360 .map_err(|e| BrowserError::ScriptExecutionFailed {
361 script: script_owned.clone(),
362 reason: e.to_string(),
363 })?
364 .into_value::<T>()
365 .map_err(|e| BrowserError::ScriptExecutionFailed {
366 script: script_owned,
367 reason: e.to_string(),
368 })
369 }
370
371 /// Save all cookies for the current page's origin.
372 ///
373 /// # Errors
374 ///
375 /// Returns [`BrowserError::CdpError`] if the CDP call fails.
376 pub async fn save_cookies(
377 &self,
378 ) -> Result<Vec<chromiumoxide::cdp::browser_protocol::network::Cookie>> {
379 use chromiumoxide::cdp::browser_protocol::network::GetCookiesParams;
380
381 let url = self
382 .page
383 .url()
384 .await
385 .map_err(|e| BrowserError::CdpError {
386 operation: "page.url".to_string(),
387 message: e.to_string(),
388 })?
389 .unwrap_or_default();
390
391 timeout(
392 self.cdp_timeout,
393 self.page
394 .execute(GetCookiesParams::builder().urls(vec![url]).build()),
395 )
396 .await
397 .map_err(|_| BrowserError::Timeout {
398 operation: "Network.getCookies".to_string(),
399 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
400 })?
401 .map_err(|e| BrowserError::CdpError {
402 operation: "Network.getCookies".to_string(),
403 message: e.to_string(),
404 })
405 .map(|r| r.cookies.clone())
406 }
407
408 /// Capture a screenshot of the current page as PNG bytes.
409 ///
410 /// The screenshot is full-page by default (viewport clipped to the rendered
411 /// layout area). Save the returned bytes to a `.png` file or process
412 /// them in-memory.
413 ///
414 /// # Errors
415 ///
416 /// Returns [`BrowserError::CdpError`] if the CDP `Page.captureScreenshot`
417 /// command fails, or [`BrowserError::Timeout`] if it exceeds
418 /// `cdp_timeout`.
419 ///
420 /// # Example
421 ///
422 /// ```no_run
423 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
424 /// use std::{time::Duration, fs};
425 ///
426 /// # async fn run() -> stygian_browser::error::Result<()> {
427 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
428 /// let handle = pool.acquire().await?;
429 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
430 /// page.navigate("https://example.com", WaitUntil::Selector("body".to_string()), Duration::from_secs(30)).await?;
431 /// let png = page.screenshot().await?;
432 /// fs::write("screenshot.png", &png).unwrap();
433 /// # Ok(())
434 /// # }
435 /// ```
436 pub async fn screenshot(&self) -> Result<Vec<u8>> {
437 use chromiumoxide::page::ScreenshotParams;
438
439 let params = ScreenshotParams::builder().full_page(true).build();
440
441 timeout(self.cdp_timeout, self.page.screenshot(params))
442 .await
443 .map_err(|_| BrowserError::Timeout {
444 operation: "Page.captureScreenshot".to_string(),
445 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
446 })?
447 .map_err(|e| BrowserError::CdpError {
448 operation: "Page.captureScreenshot".to_string(),
449 message: e.to_string(),
450 })
451 }
452
453 /// Borrow the underlying chromiumoxide [`Page`].
454 pub const fn inner(&self) -> &Page {
455 &self.page
456 }
457
458 /// Close this page (tab).
459 ///
460 /// Called automatically on drop; explicit call avoids suppressing the error.
461 pub async fn close(self) -> Result<()> {
462 timeout(Duration::from_secs(5), self.page.clone().close())
463 .await
464 .map_err(|_| BrowserError::Timeout {
465 operation: "page.close".to_string(),
466 duration_ms: 5000,
467 })?
468 .map_err(|e| BrowserError::CdpError {
469 operation: "page.close".to_string(),
470 message: e.to_string(),
471 })
472 }
473}
474
475impl Drop for PageHandle {
476 fn drop(&mut self) {
477 warn!("PageHandle dropped without explicit close(); spawning cleanup task");
478 // chromiumoxide Page does not implement close on Drop, so we spawn
479 // a fire-and-forget task. The page ref is already owned; we need to
480 // swap it out. We clone the Page handle (it's Arc-backed internally).
481 let page = self.page.clone();
482 tokio::spawn(async move {
483 let _ = page.close().await;
484 });
485 }
486}
487
488// ─── Tests ────────────────────────────────────────────────────────────────────
489
#[cfg(test)]
mod tests {
    use super::*;

    /// The `block_media` preset blocks all four media-like types and nothing else.
    #[test]
    fn resource_filter_block_media_blocks_image() {
        let media_filter = ResourceFilter::block_media();
        for blocked in ["Image", "Font", "Stylesheet", "Media"] {
            assert!(media_filter.should_block(blocked));
        }
        for allowed in ["Script", "XHR"] {
            assert!(!media_filter.should_block(allowed));
        }
    }

    /// Resource type strings are matched without regard to ASCII case.
    #[test]
    fn resource_filter_case_insensitive() {
        let images_and_fonts = ResourceFilter::block_images_and_fonts();
        // lowercase and uppercase spellings both match
        for spelling in ["image", "IMAGE"] {
            assert!(images_and_fonts.should_block(spelling));
        }
        assert!(!images_and_fonts.should_block("Stylesheet"));
    }

    /// Chained `block` calls accumulate exactly the requested types.
    #[test]
    fn resource_filter_builder_chain() {
        let chained = ResourceFilter::default()
            .block(ResourceType::Image)
            .block(ResourceType::Font);
        for blocked in ["Image", "Font"] {
            assert!(chained.should_block(blocked));
        }
        assert!(!chained.should_block("Stylesheet"));
    }

    /// Blocking the same type twice stores it only once.
    #[test]
    fn resource_filter_dedup_block() {
        let twice = ResourceFilter::default()
            .block(ResourceType::Image)
            .block(ResourceType::Image); // duplicate
        assert_eq!(twice.blocked.len(), 1);
    }

    /// Only the default filter reports itself as empty.
    #[test]
    fn resource_filter_is_empty_when_default() {
        let empty = ResourceFilter::default();
        let populated = ResourceFilter::block_media();
        assert!(empty.is_empty());
        assert!(!populated.is_empty());
    }

    /// The selector variant carries the selector string unchanged.
    #[test]
    fn wait_until_selector_stores_string() {
        let condition = WaitUntil::Selector("#foo".to_string());
        assert!(matches!(condition, WaitUntil::Selector(ref s) if s == "#foo"));
    }

    /// Every resource type maps to its expected CDP name.
    #[test]
    fn resource_type_cdp_str() {
        let expected = [
            (ResourceType::Image, "Image"),
            (ResourceType::Font, "Font"),
            (ResourceType::Stylesheet, "Stylesheet"),
            (ResourceType::Media, "Media"),
        ];
        for (resource, name) in expected {
            assert_eq!(resource.as_cdp_str(), name);
        }
    }
}