stygian_browser/page.rs
1//! Page and browsing context management for isolated, parallel scraping
2//!
3//! Each `BrowserContext` (future) is an incognito-style isolation boundary (separate
4//! cookies, localStorage, cache). Each context can contain many [`PageHandle`]s
5//! (tabs). Both types clean up their CDP resources automatically on drop.
6//!
7//! ## Resource blocking
8//!
9//! Pass a [`ResourceFilter`] to [`PageHandle::set_resource_filter`] to intercept
10//! and block specific request types (images, fonts, CSS) before page load —
11//! significantly reducing page load times for text-only scraping.
12//!
13//! ## Wait strategies
14//!
15//! [`PageHandle`] exposes three wait strategies via [`WaitUntil`]:
16//! - `DomContentLoaded` — fires when the HTML is parsed
17//! - `NetworkIdle` — fires when there are ≤2 in-flight requests for 500 ms
18//! - `Selector(css)` — fires when a CSS selector matches an element
19//!
20//! # Example
21//!
22//! ```no_run
23//! use stygian_browser::{BrowserPool, BrowserConfig};
24//! use stygian_browser::page::{ResourceFilter, WaitUntil};
25//! use std::time::Duration;
26//!
27//! # async fn run() -> stygian_browser::error::Result<()> {
28//! let pool = BrowserPool::new(BrowserConfig::default()).await?;
29//! let handle = pool.acquire().await?;
30//!
31//! let mut page = handle.browser().expect("valid browser").new_page().await?;
32//! page.set_resource_filter(ResourceFilter::block_media()).await?;
33//! page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
34//! let title = page.title().await?;
35//! println!("title: {title}");
36//! handle.release().await;
37//! # Ok(())
38//! # }
39//! ```
40
41use std::collections::HashMap;
42use std::sync::{
43 Arc,
44 atomic::{AtomicU16, Ordering},
45};
46use std::time::Duration;
47
48use chromiumoxide::Page;
49use tokio::time::timeout;
50use tracing::{debug, warn};
51
52use crate::error::{BrowserError, Result};
53
54// ─── ResourceType ─────────────────────────────────────────────────────────────
55
/// Categories of network resources that a [`ResourceFilter`] can block.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ResourceType {
    /// Images: `<img>`, `<picture>`, CSS background images.
    Image,
    /// Web fonts pulled in through CSS `@font-face` rules.
    Font,
    /// External CSS stylesheets.
    Stylesheet,
    /// Audio and video media files.
    Media,
}

impl ResourceType {
    /// Name of this resource type as spelled by the Chrome DevTools Protocol.
    ///
    /// This is the string [`ResourceFilter::should_block`] compares (ASCII
    /// case-insensitively) against the type reported for a request.
    pub const fn as_cdp_str(&self) -> &'static str {
        match *self {
            Self::Media => "Media",
            Self::Stylesheet => "Stylesheet",
            Self::Font => "Font",
            Self::Image => "Image",
        }
    }
}
80
81// ─── ResourceFilter ───────────────────────────────────────────────────────────
82
83/// Set of resource types to block from loading.
84///
85/// # Example
86///
87/// ```
88/// use stygian_browser::page::ResourceFilter;
89/// let filter = ResourceFilter::block_media();
90/// assert!(filter.should_block("Image"));
91/// ```
92#[derive(Debug, Clone, Default)]
93pub struct ResourceFilter {
94 blocked: Vec<ResourceType>,
95}
96
97impl ResourceFilter {
98 /// Block all media resources (images, fonts, CSS, audio/video).
99 pub fn block_media() -> Self {
100 Self {
101 blocked: vec![
102 ResourceType::Image,
103 ResourceType::Font,
104 ResourceType::Stylesheet,
105 ResourceType::Media,
106 ],
107 }
108 }
109
110 /// Block only images and fonts (keep styles for layout-sensitive work).
111 pub fn block_images_and_fonts() -> Self {
112 Self {
113 blocked: vec![ResourceType::Image, ResourceType::Font],
114 }
115 }
116
117 /// Add a resource type to the block list.
118 #[must_use]
119 pub fn block(mut self, resource: ResourceType) -> Self {
120 if !self.blocked.contains(&resource) {
121 self.blocked.push(resource);
122 }
123 self
124 }
125
126 /// Returns `true` if the given CDP resource type string should be blocked.
127 pub fn should_block(&self, cdp_type: &str) -> bool {
128 self.blocked
129 .iter()
130 .any(|r| r.as_cdp_str().eq_ignore_ascii_case(cdp_type))
131 }
132
133 /// Returns `true` if no resource types are blocked.
134 pub const fn is_empty(&self) -> bool {
135 self.blocked.is_empty()
136 }
137}
138
139// ─── WaitUntil ────────────────────────────────────────────────────────────────
140
141/// Condition to wait for after a navigation.
142///
143/// # Example
144///
145/// ```
146/// use stygian_browser::page::WaitUntil;
147/// let w = WaitUntil::Selector("#main".to_string());
148/// assert!(matches!(w, WaitUntil::Selector(_)));
149/// ```
#[derive(Debug, Clone)]
pub enum WaitUntil {
    /// Resolve on the `Page.domContentEventFired` CDP event: the HTML has
    /// been parsed and the DOM is ready, but subresources such as images and
    /// stylesheets may still be loading.
    DomContentLoaded,
    /// Resolve after the `Page.loadEventFired` CDP event has fired **and**
    /// the number of in-flight network requests has then settled at two or
    /// fewer for 500 ms (the heuristic Puppeteer calls `networkidle2`).
    NetworkIdle,
    /// Resolve once the contained CSS selector matches an element in the
    /// page.
    Selector(String),
}
163
164// ─── NodeHandle ───────────────────────────────────────────────────────────────
165
166/// A handle to a live DOM node backed by a CDP `RemoteObjectId`.
167///
168/// Obtained via [`PageHandle::query_selector_all`]. Each method issues one or
169/// more CDP `Runtime.callFunctionOn` calls against the held V8 remote object
170/// reference — no HTML serialisation occurs.
171///
172/// A handle becomes **stale** after page navigation or if the underlying DOM
173/// node is removed. Stale calls return [`BrowserError::StaleNode`] so callers
174/// can distinguish them from other CDP failures.
175///
176/// # Example
177///
178/// ```no_run
179/// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
180/// use std::time::Duration;
181///
182/// # async fn run() -> stygian_browser::error::Result<()> {
183/// let pool = BrowserPool::new(BrowserConfig::default()).await?;
184/// let handle = pool.acquire().await?;
185/// let mut page = handle.browser().expect("valid browser").new_page().await?;
186/// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
187///
188/// for node in page.query_selector_all("a[href]").await? {
189/// let href = node.attr("href").await?;
190/// let text = node.text_content().await?;
191/// println!("{text}: {href:?}");
192/// }
193/// # Ok(())
194/// # }
195/// ```
196pub struct NodeHandle {
197 element: chromiumoxide::element::Element,
198 /// Original CSS selector — preserved for stale-node error messages only.
199 /// Shared via `Arc<str>` so all handles from a single query reuse the
200 /// same allocation rather than cloning a `String` per node.
201 selector: Arc<str>,
202 cdp_timeout: Duration,
203 /// Cloned page reference used only for document-level element resolution
204 /// during DOM traversal (parent / sibling navigation).
205 page: chromiumoxide::Page,
206}
207
208impl NodeHandle {
209 /// Return a single attribute value, or `None` if the attribute is absent.
210 ///
211 /// Issues one `Runtime.callFunctionOn` CDP call (`el.getAttribute(name)`).
212 ///
213 /// # Errors
214 ///
215 /// Returns [`BrowserError::StaleNode`] when the remote object has been
216 /// invalidated, or [`BrowserError::Timeout`] / [`BrowserError::CdpError`]
217 /// on transport-level failures.
218 pub async fn attr(&self, name: &str) -> Result<Option<String>> {
219 timeout(self.cdp_timeout, self.element.attribute(name))
220 .await
221 .map_err(|_| BrowserError::Timeout {
222 operation: "NodeHandle::attr".to_string(),
223 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
224 })?
225 .map_err(|e| self.cdp_err_or_stale(&e, "attr"))
226 }
227
228 /// Return all attributes as a `HashMap<name, value>` in a **single**
229 /// CDP round-trip.
230 ///
231 /// Uses `DOM.getAttributes` (via the chromiumoxide `attributes()` API)
232 /// which returns a flat `[name, value, name, value, …]` list from the node
233 /// description — no per-attribute calls are needed.
234 ///
235 /// # Errors
236 ///
237 /// Returns [`BrowserError::StaleNode`] when the remote object has been
238 /// invalidated.
239 pub async fn attr_map(&self) -> Result<HashMap<String, String>> {
240 let flat = timeout(self.cdp_timeout, self.element.attributes())
241 .await
242 .map_err(|_| BrowserError::Timeout {
243 operation: "NodeHandle::attr_map".to_string(),
244 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
245 })?
246 .map_err(|e| self.cdp_err_or_stale(&e, "attr_map"))?;
247
248 let mut map = HashMap::with_capacity(flat.len() / 2);
249 for pair in flat.chunks_exact(2) {
250 if let [name, value] = pair {
251 map.insert(name.clone(), value.clone());
252 }
253 }
254 Ok(map)
255 }
256
257 /// Return the element's `textContent` (all text inside, no markup).
258 ///
259 /// Reads the DOM `textContent` property via a single JS eval — this is the
260 /// raw text concatenation of all descendant text nodes, independent of
261 /// layout or visibility (unlike `innerText`).
262 ///
263 /// Returns an empty string when the property is absent or null.
264 ///
265 /// # Errors
266 ///
267 /// Returns [`BrowserError::StaleNode`] when the remote object has been
268 /// invalidated.
269 pub async fn text_content(&self) -> Result<String> {
270 let returns = timeout(
271 self.cdp_timeout,
272 self.element
273 .call_js_fn(r"function() { return this.textContent ?? ''; }", true),
274 )
275 .await
276 .map_err(|_| BrowserError::Timeout {
277 operation: "NodeHandle::text_content".to_string(),
278 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
279 })?
280 .map_err(|e| self.cdp_err_or_stale(&e, "text_content"))?;
281
282 Ok(returns
283 .result
284 .value
285 .as_ref()
286 .and_then(|v| v.as_str())
287 .unwrap_or("")
288 .to_string())
289 }
290
291 /// Return the element's `innerHTML`.
292 ///
293 /// Returns an empty string when the property is absent or null.
294 ///
295 /// # Errors
296 ///
297 /// Returns [`BrowserError::StaleNode`] when the remote object has been
298 /// invalidated.
299 pub async fn inner_html(&self) -> Result<String> {
300 timeout(self.cdp_timeout, self.element.inner_html())
301 .await
302 .map_err(|_| BrowserError::Timeout {
303 operation: "NodeHandle::inner_html".to_string(),
304 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
305 })?
306 .map_err(|e| self.cdp_err_or_stale(&e, "inner_html"))
307 .map(Option::unwrap_or_default)
308 }
309
310 /// Return the element's `outerHTML`.
311 ///
312 /// Returns an empty string when the property is absent or null.
313 ///
314 /// # Errors
315 ///
316 /// Returns [`BrowserError::StaleNode`] when the remote object has been
317 /// invalidated.
318 pub async fn outer_html(&self) -> Result<String> {
319 timeout(self.cdp_timeout, self.element.outer_html())
320 .await
321 .map_err(|_| BrowserError::Timeout {
322 operation: "NodeHandle::outer_html".to_string(),
323 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
324 })?
325 .map_err(|e| self.cdp_err_or_stale(&e, "outer_html"))
326 .map(Option::unwrap_or_default)
327 }
328
329 /// Return the ancestor tag-name chain, root-last.
330 ///
331 /// Executes a single `Runtime.callFunctionOn` JavaScript function that
332 /// walks `parentElement` and collects tag names — no repeated CDP calls.
333 ///
334 /// ```text
335 /// // for <span> inside <p> inside <article> inside <body> inside <html>
336 /// ["p", "article", "body", "html"]
337 /// ```
338 ///
339 /// # Errors
340 ///
341 /// Returns [`BrowserError::StaleNode`] when the remote object has been
342 /// invalidated, or [`BrowserError::ScriptExecutionFailed`] when CDP
343 /// returns no value or the value is not a string array.
344 pub async fn ancestors(&self) -> Result<Vec<String>> {
345 let returns = timeout(
346 self.cdp_timeout,
347 self.element.call_js_fn(
348 r"function() {
349 const a = [];
350 let n = this.parentElement;
351 while (n) { a.push(n.tagName.toLowerCase()); n = n.parentElement; }
352 return a;
353 }",
354 true,
355 ),
356 )
357 .await
358 .map_err(|_| BrowserError::Timeout {
359 operation: "NodeHandle::ancestors".to_string(),
360 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
361 })?
362 .map_err(|e| self.cdp_err_or_stale(&e, "ancestors"))?;
363
364 // With returnByValue=true and an array return, CDP delivers the value
365 // as a JSON array directly — no JSON.stringify/re-parse needed.
366 // A missing or wrong-type value indicates an unexpected CDP failure.
367 let arr = returns
368 .result
369 .value
370 .as_ref()
371 .and_then(|v| v.as_array())
372 .ok_or_else(|| BrowserError::ScriptExecutionFailed {
373 script: "NodeHandle::ancestors".to_string(),
374 reason: "CDP returned no value or a non-array value for ancestors()".to_string(),
375 })?;
376
377 arr.iter()
378 .map(|v| {
379 v.as_str().map(ToString::to_string).ok_or_else(|| {
380 BrowserError::ScriptExecutionFailed {
381 script: "NodeHandle::ancestors".to_string(),
382 reason: format!("ancestor entry is not a string: {v}"),
383 }
384 })
385 })
386 .collect()
387 }
388
389 /// Return child elements matching `selector` as new [`NodeHandle`]s.
390 ///
391 /// Issues a single `Runtime.callFunctionOn` + `DOM.querySelectorAll`
392 /// call scoped to this element — not to the entire document.
393 ///
394 /// Returns an empty `Vec` when no children match (consistent with the JS
395 /// `querySelectorAll` contract).
396 ///
397 /// # Errors
398 ///
399 /// Returns [`BrowserError::StaleNode`] when the remote object has been
400 /// invalidated, or [`BrowserError::CdpError`] on transport failure.
401 pub async fn children_matching(&self, selector: &str) -> Result<Vec<Self>> {
402 let elements = timeout(self.cdp_timeout, self.element.find_elements(selector))
403 .await
404 .map_err(|_| BrowserError::Timeout {
405 operation: "NodeHandle::children_matching".to_string(),
406 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
407 })?
408 .map_err(|e| self.cdp_err_or_stale(&e, "children_matching"))?;
409
410 let selector_arc: Arc<str> = Arc::from(selector);
411 Ok(elements
412 .into_iter()
413 .map(|el| Self {
414 element: el,
415 selector: selector_arc.clone(),
416 cdp_timeout: self.cdp_timeout,
417 page: self.page.clone(),
418 })
419 .collect())
420 }
421
422 /// Return the immediate parent element, or `None` if this element has no
423 /// parent (i.e. it is the document root).
424 ///
425 /// Issues a single `Runtime.callFunctionOn` CDP call that temporarily tags
426 /// the parent element with a unique attribute, then resolves it via a
427 /// document-level `DOM.querySelector` before removing the tag.
428 ///
429 /// # Errors
430 ///
431 /// Returns [`BrowserError::StaleNode`] when the remote object has been
432 /// invalidated.
433 ///
434 /// # Example
435 ///
436 /// ```no_run
437 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
438 /// use std::time::Duration;
439 ///
440 /// # async fn run() -> stygian_browser::error::Result<()> {
441 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
442 /// let handle = pool.acquire().await?;
443 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
444 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
445 /// let nodes = page.query_selector_all("p").await?;
446 /// if let Some(parent) = nodes[0].parent().await? {
447 /// let html = parent.outer_html().await?;
448 /// println!("parent: {}", &html[..html.len().min(80)]);
449 /// }
450 /// # Ok(())
451 /// # }
452 /// ```
453 pub async fn parent(&self) -> Result<Option<Self>> {
454 let attr = format!(
455 "data-stygian-t-{}",
456 ulid::Ulid::new().to_string().to_lowercase()
457 );
458 let js = format!(
459 "function() {{ \
460 var t = this.parentElement; \
461 if (!t) {{ return false; }} \
462 t.setAttribute('{attr}', '1'); \
463 return true; \
464 }}"
465 );
466 self.call_traversal(&js, &attr, "parent").await
467 }
468
469 /// Return the next element sibling, or `None` if this element is the last
470 /// child of its parent.
471 ///
472 /// Uses `nextElementSibling` (skips text/comment nodes).
473 ///
474 /// # Errors
475 ///
476 /// Returns [`BrowserError::StaleNode`] when the remote object has been
477 /// invalidated.
478 ///
479 /// # Example
480 ///
481 /// ```no_run
482 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
483 /// use std::time::Duration;
484 ///
485 /// # async fn run() -> stygian_browser::error::Result<()> {
486 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
487 /// let handle = pool.acquire().await?;
488 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
489 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
490 /// let nodes = page.query_selector_all("li").await?;
491 /// if let Some(next) = nodes[0].next_sibling().await? {
492 /// println!("next sibling: {}", next.text_content().await?);
493 /// }
494 /// # Ok(())
495 /// # }
496 /// ```
497 pub async fn next_sibling(&self) -> Result<Option<Self>> {
498 let attr = format!(
499 "data-stygian-t-{}",
500 ulid::Ulid::new().to_string().to_lowercase()
501 );
502 let js = format!(
503 "function() {{ \
504 var t = this.nextElementSibling; \
505 if (!t) {{ return false; }} \
506 t.setAttribute('{attr}', '1'); \
507 return true; \
508 }}"
509 );
510 self.call_traversal(&js, &attr, "next").await
511 }
512
513 /// Return the previous element sibling, or `None` if this element is the
514 /// first child of its parent.
515 ///
516 /// Uses `previousElementSibling` (skips text/comment nodes).
517 ///
518 /// # Errors
519 ///
520 /// Returns [`BrowserError::StaleNode`] when the remote object has been
521 /// invalidated.
522 ///
523 /// # Example
524 ///
525 /// ```no_run
526 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
527 /// use std::time::Duration;
528 ///
529 /// # async fn run() -> stygian_browser::error::Result<()> {
530 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
531 /// let handle = pool.acquire().await?;
532 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
533 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
534 /// let nodes = page.query_selector_all("li").await?;
535 /// if let Some(prev) = nodes[1].previous_sibling().await? {
536 /// println!("prev sibling: {}", prev.text_content().await?);
537 /// }
538 /// # Ok(())
539 /// # }
540 /// ```
541 pub async fn previous_sibling(&self) -> Result<Option<Self>> {
542 let attr = format!(
543 "data-stygian-t-{}",
544 ulid::Ulid::new().to_string().to_lowercase()
545 );
546 let js = format!(
547 "function() {{ \
548 var t = this.previousElementSibling; \
549 if (!t) {{ return false; }} \
550 t.setAttribute('{attr}', '1'); \
551 return true; \
552 }}"
553 );
554 self.call_traversal(&js, &attr, "prev").await
555 }
556
557 /// Shared traversal implementation used by [`parent`], [`next_sibling`],
558 /// and [`previous_sibling`].
559 ///
560 /// The caller provides a JS function that:
561 /// 1. Navigates to the target element (parent / sibling).
562 /// 2. If the target is non-null, sets a unique attribute (`attr_name`)
563 /// on it and returns `true`.
564 /// 3. Returns `false` when the target is null (no such neighbour).
565 ///
566 /// This helper then resolves the tagged element from the document root,
567 /// removes the temporary attribute, and wraps the result in a
568 /// `NodeHandle`.
569 ///
570 /// [`parent`]: Self::parent
571 /// [`next_sibling`]: Self::next_sibling
572 /// [`previous_sibling`]: Self::previous_sibling
573 async fn call_traversal(
574 &self,
575 js_fn: &str,
576 attr_name: &str,
577 selector_suffix: &str,
578 ) -> Result<Option<Self>> {
579 // Step 1: Run the JS that tags the target element and reports null/non-null.
580 let op_tag = format!("NodeHandle::{selector_suffix}::tag");
581 let returns = timeout(self.cdp_timeout, self.element.call_js_fn(js_fn, false))
582 .await
583 .map_err(|_| BrowserError::Timeout {
584 operation: op_tag.clone(),
585 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
586 })?
587 .map_err(|e| self.cdp_err_or_stale(&e, selector_suffix))?;
588
589 // JS returns false → no such neighbour.
590 let has_target = returns
591 .result
592 .value
593 .as_ref()
594 .and_then(serde_json::Value::as_bool)
595 .unwrap_or(false);
596 if !has_target {
597 return Ok(None);
598 }
599
600 // Step 2: Resolve the tagged element via a document-level querySelector.
601 let css = format!("[{attr_name}]");
602 let op_resolve = format!("NodeHandle::{selector_suffix}::resolve");
603 let element = timeout(self.cdp_timeout, self.page.find_element(css))
604 .await
605 .map_err(|_| BrowserError::Timeout {
606 operation: op_resolve.clone(),
607 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
608 })?
609 .map_err(|e| BrowserError::CdpError {
610 operation: op_resolve,
611 message: e.to_string(),
612 })?;
613
614 // Step 3: Remove the temporary attribute (best-effort; a failure here
615 // is non-fatal — it leaves a harmless stale attribute in the DOM).
616 let cleanup = format!("function() {{ this.removeAttribute('{attr_name}'); }}");
617 let _ = element.call_js_fn(cleanup, false).await;
618
619 // Step 4: Wrap in a NodeHandle with the diagnostic selector suffix.
620 let new_selector: Arc<str> =
621 Arc::from(format!("{}::{selector_suffix}", self.selector).as_str());
622 Ok(Some(Self {
623 element,
624 selector: new_selector,
625 cdp_timeout: self.cdp_timeout,
626 page: self.page.clone(),
627 }))
628 }
629
630 /// Map a chromiumoxide `CdpError` to either [`BrowserError::StaleNode`]
631 /// (when the remote object reference has been invalidated) or
632 /// [`BrowserError::CdpError`] for all other failures.
633 fn cdp_err_or_stale(
634 &self,
635 err: &chromiumoxide::error::CdpError,
636 operation: &str,
637 ) -> BrowserError {
638 let msg = err.to_string();
639 if msg.contains("Cannot find object with id")
640 || msg.contains("context with specified id")
641 || msg.contains("Cannot find context")
642 {
643 BrowserError::StaleNode {
644 selector: self.selector.to_string(),
645 }
646 } else {
647 BrowserError::CdpError {
648 operation: operation.to_string(),
649 message: msg,
650 }
651 }
652 }
653}
654
655// ─── PageHandle ───────────────────────────────────────────────────────────────
656
657/// A handle to an open browser tab.
658///
659/// On drop the underlying page is closed automatically.
660///
661/// # Example
662///
663/// ```no_run
664/// use stygian_browser::{BrowserPool, BrowserConfig};
665/// use stygian_browser::page::WaitUntil;
666/// use std::time::Duration;
667///
668/// # async fn run() -> stygian_browser::error::Result<()> {
669/// let pool = BrowserPool::new(BrowserConfig::default()).await?;
670/// let handle = pool.acquire().await?;
671/// let mut page = handle.browser().expect("valid browser").new_page().await?;
672/// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
673/// let html = page.content().await?;
674/// drop(page); // closes the tab
675/// handle.release().await;
676/// # Ok(())
677/// # }
678/// ```
679pub struct PageHandle {
680 page: Page,
681 cdp_timeout: Duration,
682 /// HTTP status code of the most recent main-frame navigation, or `0` if not
683 /// yet captured. Written atomically by the listener spawned in `navigate()`.
684 last_status_code: Arc<AtomicU16>,
685 /// Background task processing `Fetch.requestPaused` events. Aborted and
686 /// replaced each time `set_resource_filter` is called.
687 resource_filter_task: Option<tokio::task::JoinHandle<()>>,
688}
689
690impl PageHandle {
691 /// Wrap a raw chromiumoxide [`Page`] in a handle.
692 pub(crate) fn new(page: Page, cdp_timeout: Duration) -> Self {
693 Self {
694 page,
695 cdp_timeout,
696 last_status_code: Arc::new(AtomicU16::new(0)),
697 resource_filter_task: None,
698 }
699 }
700
701 /// Navigate to `url` and wait for `condition` within `nav_timeout`.
702 ///
703 /// # Errors
704 ///
705 /// Returns [`BrowserError::NavigationFailed`] if the navigation times out or
706 /// the CDP call fails.
707 pub async fn navigate(
708 &mut self,
709 url: &str,
710 condition: WaitUntil,
711 nav_timeout: Duration,
712 ) -> Result<()> {
713 self.setup_status_capture().await;
714 timeout(
715 nav_timeout,
716 self.navigate_inner(url, condition, nav_timeout),
717 )
718 .await
719 .map_err(|_| BrowserError::NavigationFailed {
720 url: url.to_string(),
721 reason: format!("navigation timed out after {nav_timeout:?}"),
722 })?
723 }
724
725 /// Reset the last status code and wire up the `Network.responseReceived`
726 /// listener before any navigation starts. Errors are logged and swallowed
727 /// so that a missing network domain never blocks navigation.
728 async fn setup_status_capture(&self) {
729 use chromiumoxide::cdp::browser_protocol::network::{
730 EventResponseReceived, ResourceType as NetworkResourceType,
731 };
732 use futures::StreamExt;
733
734 // Reset so a stale code is not returned if the new navigation fails
735 // before the response headers arrive.
736 self.last_status_code.store(0, Ordering::Release);
737
738 // Subscribe *before* goto() — the listener runs in a detached task and
739 // stores the first Document-type response status atomically.
740 let page_for_listener = self.page.clone();
741 let status_capture = Arc::clone(&self.last_status_code);
742 match page_for_listener
743 .event_listener::<EventResponseReceived>()
744 .await
745 {
746 Ok(mut stream) => {
747 tokio::spawn(async move {
748 while let Some(event) = stream.next().await {
749 if event.r#type == NetworkResourceType::Document {
750 let code = u16::try_from(event.response.status).unwrap_or(0);
751 if code > 0 {
752 status_capture.store(code, Ordering::Release);
753 }
754 break;
755 }
756 }
757 });
758 }
759 Err(e) => warn!("status-code capture unavailable: {e}"),
760 }
761 }
762
763 /// Subscribe to the appropriate CDP events, fire `goto`, then await
764 /// `condition`. All subscriptions precede `goto` to eliminate the race
765 /// described in issue #7.
766 async fn navigate_inner(
767 &self,
768 url: &str,
769 condition: WaitUntil,
770 nav_timeout: Duration,
771 ) -> Result<()> {
772 use chromiumoxide::cdp::browser_protocol::page::{
773 EventDomContentEventFired, EventLoadEventFired,
774 };
775 use futures::StreamExt;
776
777 let url_owned = url.to_string();
778
779 let mut dom_events = match &condition {
780 WaitUntil::DomContentLoaded => Some(
781 self.page
782 .event_listener::<EventDomContentEventFired>()
783 .await
784 .map_err(|e| BrowserError::NavigationFailed {
785 url: url_owned.clone(),
786 reason: e.to_string(),
787 })?,
788 ),
789 _ => None,
790 };
791
792 let mut load_events = match &condition {
793 WaitUntil::NetworkIdle => Some(
794 self.page
795 .event_listener::<EventLoadEventFired>()
796 .await
797 .map_err(|e| BrowserError::NavigationFailed {
798 url: url_owned.clone(),
799 reason: e.to_string(),
800 })?,
801 ),
802 _ => None,
803 };
804
805 let inflight = if matches!(condition, WaitUntil::NetworkIdle) {
806 Some(self.subscribe_inflight_counter().await)
807 } else {
808 None
809 };
810
811 self.page
812 .goto(url)
813 .await
814 .map_err(|e| BrowserError::NavigationFailed {
815 url: url_owned.clone(),
816 reason: e.to_string(),
817 })?;
818
819 match &condition {
820 WaitUntil::DomContentLoaded => {
821 if let Some(ref mut events) = dom_events {
822 let _ = events.next().await;
823 }
824 }
825 WaitUntil::NetworkIdle => {
826 if let Some(ref mut events) = load_events {
827 let _ = events.next().await;
828 }
829 if let Some(ref counter) = inflight {
830 Self::wait_network_idle(counter).await;
831 }
832 }
833 WaitUntil::Selector(css) => {
834 self.wait_for_selector(css, nav_timeout).await?;
835 }
836 }
837 Ok(())
838 }
839
840 /// Spawn three detached tasks that maintain a signed in-flight request
841 /// counter via `Network.requestWillBeSent` (+1) and
842 /// `Network.loadingFinished`/`Network.loadingFailed` (−1 each).
843 /// Returns the shared counter so the caller can poll it.
844 async fn subscribe_inflight_counter(&self) -> Arc<std::sync::atomic::AtomicI32> {
845 use std::sync::atomic::AtomicI32;
846
847 use chromiumoxide::cdp::browser_protocol::network::{
848 EventLoadingFailed, EventLoadingFinished, EventRequestWillBeSent,
849 };
850 use futures::StreamExt;
851
852 let counter: Arc<AtomicI32> = Arc::new(AtomicI32::new(0));
853 let pairs: [(Arc<AtomicI32>, i32); 3] = [
854 (Arc::clone(&counter), 1),
855 (Arc::clone(&counter), -1),
856 (Arc::clone(&counter), -1),
857 ];
858 let [p1, p2, p3] = [self.page.clone(), self.page.clone(), self.page.clone()];
859
860 macro_rules! spawn_tracker {
861 ($page:expr, $event:ty, $c:expr, $delta:expr) => {
862 match $page.event_listener::<$event>().await {
863 Ok(mut s) => {
864 let c = $c;
865 let d = $delta;
866 tokio::spawn(async move {
867 while s.next().await.is_some() {
868 c.fetch_add(d, Ordering::Relaxed);
869 }
870 });
871 }
872 Err(e) => warn!("network-idle tracker unavailable: {e}"),
873 }
874 };
875 }
876
877 let [(c1, d1), (c2, d2), (c3, d3)] = pairs;
878 spawn_tracker!(p1, EventRequestWillBeSent, c1, d1);
879 spawn_tracker!(p2, EventLoadingFinished, c2, d2);
880 spawn_tracker!(p3, EventLoadingFailed, c3, d3);
881
882 counter
883 }
884
885 /// Poll `counter` until ≤ 2 in-flight requests persist for 500 ms
886 /// (equivalent to Playwright's `networkidle2`).
887 async fn wait_network_idle(counter: &Arc<std::sync::atomic::AtomicI32>) {
888 const IDLE_THRESHOLD: i32 = 2;
889 const SETTLE: Duration = Duration::from_millis(500);
890 loop {
891 if counter.load(Ordering::Relaxed) <= IDLE_THRESHOLD {
892 tokio::time::sleep(SETTLE).await;
893 if counter.load(Ordering::Relaxed) <= IDLE_THRESHOLD {
894 break;
895 }
896 } else {
897 tokio::time::sleep(Duration::from_millis(50)).await;
898 }
899 }
900 }
901
902 /// Wait until `document.querySelector(selector)` is non-null (`timeout`).
903 ///
904 /// # Errors
905 ///
906 /// Returns [`BrowserError::NavigationFailed`] if the selector is not found
907 /// within the given timeout.
908 pub async fn wait_for_selector(&self, selector: &str, wait_timeout: Duration) -> Result<()> {
909 let selector_owned = selector.to_string();
910 let poll = async {
911 loop {
912 if self.page.find_element(selector_owned.clone()).await.is_ok() {
913 return Ok(());
914 }
915 tokio::time::sleep(Duration::from_millis(100)).await;
916 }
917 };
918
919 timeout(wait_timeout, poll)
920 .await
921 .map_err(|_| BrowserError::NavigationFailed {
922 url: String::new(),
923 reason: format!("selector '{selector_owned}' not found within {wait_timeout:?}"),
924 })?
925 }
926
927 /// Set a resource filter to block specific network request types.
928 ///
929 /// Enables `Fetch` interception and spawns a background task that continues
930 /// allowed requests and fails blocked ones with `BlockedByClient`. Any
931 /// previously set filter task is cancelled first.
932 ///
933 /// # Errors
934 ///
935 /// Returns a [`BrowserError::CdpError`] if the CDP call fails.
936 pub async fn set_resource_filter(&mut self, filter: ResourceFilter) -> Result<()> {
937 use chromiumoxide::cdp::browser_protocol::fetch::{
938 ContinueRequestParams, EnableParams, EventRequestPaused, FailRequestParams,
939 RequestPattern,
940 };
941 use chromiumoxide::cdp::browser_protocol::network::ErrorReason;
942 use futures::StreamExt as _;
943
944 if filter.is_empty() {
945 return Ok(());
946 }
947
948 // Cancel any previously running filter task.
949 if let Some(task) = self.resource_filter_task.take() {
950 task.abort();
951 }
952
953 let pattern = RequestPattern::builder().url_pattern("*").build();
954 let params = EnableParams::builder()
955 .patterns(vec![pattern])
956 .handle_auth_requests(false)
957 .build();
958
959 timeout(self.cdp_timeout, self.page.execute::<EnableParams>(params))
960 .await
961 .map_err(|_| BrowserError::Timeout {
962 operation: "Fetch.enable".to_string(),
963 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
964 })?
965 .map_err(|e| BrowserError::CdpError {
966 operation: "Fetch.enable".to_string(),
967 message: e.to_string(),
968 })?;
969
970 // Subscribe to requestPaused events and dispatch each one so navigation
971 // is never blocked. Without this handler Chrome holds every intercepted
972 // request indefinitely and the page hangs.
973 let mut events = self
974 .page
975 .event_listener::<EventRequestPaused>()
976 .await
977 .map_err(|e| BrowserError::CdpError {
978 operation: "Fetch.requestPaused subscribe".to_string(),
979 message: e.to_string(),
980 })?;
981
982 let page = self.page.clone();
983 debug!("Resource filter active: {:?}", filter);
984 let task = tokio::spawn(async move {
985 while let Some(event) = events.next().await {
986 let request_id = event.request_id.clone();
987 if filter.should_block(event.resource_type.as_ref()) {
988 let params = FailRequestParams::new(request_id, ErrorReason::BlockedByClient);
989 let _ = page.execute(params).await;
990 } else {
991 let _ = page.execute(ContinueRequestParams::new(request_id)).await;
992 }
993 }
994 });
995
996 self.resource_filter_task = Some(task);
997 Ok(())
998 }
999
1000 /// Return the current page URL (post-navigation, post-redirect).
1001 ///
1002 /// Delegates to the CDP `Target.getTargetInfo` binding already used
1003 /// internally by [`save_cookies`](Self::save_cookies); no extra network
1004 /// request is made. Returns an empty string if the URL is not yet set
1005 /// (e.g. on a blank tab before the first navigation).
1006 ///
1007 /// # Errors
1008 ///
1009 /// Returns [`BrowserError::CdpError`] if the underlying CDP call fails, or
1010 /// [`BrowserError::Timeout`] if it exceeds `cdp_timeout`.
1011 ///
1012 /// # Example
1013 ///
1014 /// ```no_run
1015 /// use stygian_browser::{BrowserPool, BrowserConfig};
1016 /// use stygian_browser::page::WaitUntil;
1017 /// use std::time::Duration;
1018 ///
1019 /// # async fn run() -> stygian_browser::error::Result<()> {
1020 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1021 /// let handle = pool.acquire().await?;
1022 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1023 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
1024 /// let url = page.url().await?;
1025 /// println!("Final URL after redirects: {url}");
1026 /// # Ok(())
1027 /// # }
1028 /// ```
1029 pub async fn url(&self) -> Result<String> {
1030 timeout(self.cdp_timeout, self.page.url())
1031 .await
1032 .map_err(|_| BrowserError::Timeout {
1033 operation: "page.url".to_string(),
1034 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1035 })?
1036 .map_err(|e| BrowserError::CdpError {
1037 operation: "page.url".to_string(),
1038 message: e.to_string(),
1039 })
1040 .map(Option::unwrap_or_default)
1041 }
1042
1043 /// Return the HTTP status code of the most recent main-frame navigation.
1044 ///
1045 /// The status is captured from the `Network.responseReceived` CDP event
1046 /// wired up inside [`navigate`](Self::navigate), so it reflects the
1047 /// *final* response after any server-side redirects.
1048 ///
1049 /// Returns `None` if the status was not captured — for example on `file://`
1050 /// navigations, when [`navigate`](Self::navigate) has not yet been called,
1051 /// or if the network event subscription failed.
1052 ///
1053 /// # Errors
1054 ///
1055 /// This method is infallible; the `Result` wrapper is kept for API
1056 /// consistency with other `PageHandle` methods.
1057 ///
1058 /// # Example
1059 ///
1060 /// ```no_run
1061 /// use stygian_browser::{BrowserPool, BrowserConfig};
1062 /// use stygian_browser::page::WaitUntil;
1063 /// use std::time::Duration;
1064 ///
1065 /// # async fn run() -> stygian_browser::error::Result<()> {
1066 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1067 /// let handle = pool.acquire().await?;
1068 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1069 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
1070 /// if let Some(code) = page.status_code()? {
1071 /// println!("HTTP {code}");
1072 /// }
1073 /// # Ok(())
1074 /// # }
1075 /// ```
1076 pub fn status_code(&self) -> Result<Option<u16>> {
1077 let code = self.last_status_code.load(Ordering::Acquire);
1078 Ok(if code == 0 { None } else { Some(code) })
1079 }
1080
1081 /// Return the page's `<title>` text.
1082 ///
1083 /// # Errors
1084 ///
1085 /// Returns [`BrowserError::ScriptExecutionFailed`] if the evaluation fails.
1086 pub async fn title(&self) -> Result<String> {
1087 timeout(self.cdp_timeout, self.page.get_title())
1088 .await
1089 .map_err(|_| BrowserError::Timeout {
1090 operation: "get_title".to_string(),
1091 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1092 })?
1093 .map_err(|e| BrowserError::ScriptExecutionFailed {
1094 script: "document.title".to_string(),
1095 reason: e.to_string(),
1096 })
1097 .map(Option::unwrap_or_default)
1098 }
1099
1100 /// Return the page's full outer HTML.
1101 ///
1102 /// # Errors
1103 ///
1104 /// Returns [`BrowserError::ScriptExecutionFailed`] if the evaluation fails.
1105 pub async fn content(&self) -> Result<String> {
1106 timeout(self.cdp_timeout, self.page.content())
1107 .await
1108 .map_err(|_| BrowserError::Timeout {
1109 operation: "page.content".to_string(),
1110 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1111 })?
1112 .map_err(|e| BrowserError::ScriptExecutionFailed {
1113 script: "document.documentElement.outerHTML".to_string(),
1114 reason: e.to_string(),
1115 })
1116 }
1117
1118 /// Query the live DOM for all elements matching `selector` and return
1119 /// lightweight [`NodeHandle`]s backed by CDP `RemoteObjectId`s.
1120 ///
1121 /// No HTML serialisation occurs — the browser's in-memory DOM is queried
1122 /// directly over the CDP connection, eliminating the `page.content()` +
1123 /// `scraper::Html::parse_document` round-trip.
1124 ///
1125 /// Returns an empty `Vec` when no elements match (consistent with the JS
1126 /// `querySelectorAll` contract — not an error).
1127 ///
1128 /// # Errors
1129 ///
1130 /// Returns [`BrowserError::CdpError`] if the CDP find call fails, or
1131 /// [`BrowserError::Timeout`] if it exceeds `cdp_timeout`.
1132 ///
1133 /// # Example
1134 ///
1135 /// ```no_run
1136 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
1137 /// use std::time::Duration;
1138 ///
1139 /// # async fn run() -> stygian_browser::error::Result<()> {
1140 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1141 /// let handle = pool.acquire().await?;
1142 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1143 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
1144 ///
1145 /// let nodes = page.query_selector_all("[data-ux]").await?;
1146 /// for node in &nodes {
1147 /// let ux_type = node.attr("data-ux").await?;
1148 /// let text = node.text_content().await?;
1149 /// println!("{ux_type:?}: {text}");
1150 /// }
1151 /// # Ok(())
1152 /// # }
1153 /// ```
1154 pub async fn query_selector_all(&self, selector: &str) -> Result<Vec<NodeHandle>> {
1155 let elements = timeout(self.cdp_timeout, self.page.find_elements(selector))
1156 .await
1157 .map_err(|_| BrowserError::Timeout {
1158 operation: "PageHandle::query_selector_all".to_string(),
1159 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1160 })?
1161 .map_err(|e| BrowserError::CdpError {
1162 operation: "PageHandle::query_selector_all".to_string(),
1163 message: e.to_string(),
1164 })?;
1165
1166 let selector_arc: Arc<str> = Arc::from(selector);
1167 Ok(elements
1168 .into_iter()
1169 .map(|el| NodeHandle {
1170 element: el,
1171 selector: selector_arc.clone(),
1172 cdp_timeout: self.cdp_timeout,
1173 page: self.page.clone(),
1174 })
1175 .collect())
1176 }
1177
1178 /// Evaluate arbitrary JavaScript and return the result as `T`.
1179 ///
1180 /// # Errors
1181 ///
1182 /// Returns [`BrowserError::ScriptExecutionFailed`] on eval failure or
1183 /// deserialization error.
1184 pub async fn eval<T: serde::de::DeserializeOwned>(&self, script: &str) -> Result<T> {
1185 let script_owned = script.to_string();
1186 timeout(self.cdp_timeout, self.page.evaluate(script))
1187 .await
1188 .map_err(|_| BrowserError::Timeout {
1189 operation: "page.evaluate".to_string(),
1190 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1191 })?
1192 .map_err(|e| BrowserError::ScriptExecutionFailed {
1193 script: script_owned.clone(),
1194 reason: e.to_string(),
1195 })?
1196 .into_value::<T>()
1197 .map_err(|e| BrowserError::ScriptExecutionFailed {
1198 script: script_owned,
1199 reason: e.to_string(),
1200 })
1201 }
1202
1203 /// Save all cookies for the current page's origin.
1204 ///
1205 /// # Errors
1206 ///
1207 /// Returns [`BrowserError::CdpError`] if the CDP call fails.
1208 pub async fn save_cookies(
1209 &self,
1210 ) -> Result<Vec<chromiumoxide::cdp::browser_protocol::network::Cookie>> {
1211 use chromiumoxide::cdp::browser_protocol::network::GetCookiesParams;
1212
1213 let url = self
1214 .page
1215 .url()
1216 .await
1217 .map_err(|e| BrowserError::CdpError {
1218 operation: "page.url".to_string(),
1219 message: e.to_string(),
1220 })?
1221 .unwrap_or_default();
1222
1223 timeout(
1224 self.cdp_timeout,
1225 self.page
1226 .execute(GetCookiesParams::builder().urls(vec![url]).build()),
1227 )
1228 .await
1229 .map_err(|_| BrowserError::Timeout {
1230 operation: "Network.getCookies".to_string(),
1231 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1232 })?
1233 .map_err(|e| BrowserError::CdpError {
1234 operation: "Network.getCookies".to_string(),
1235 message: e.to_string(),
1236 })
1237 .map(|r| r.cookies.clone())
1238 }
1239
1240 /// Inject cookies into the current page.
1241 ///
1242 /// Seeds session tokens or other state without needing a full
1243 /// [`SessionSnapshot`][crate::session::SessionSnapshot] and without
1244 /// requiring a direct `chromiumoxide` dependency in calling code.
1245 ///
1246 /// Individual cookie failures are logged as warnings and do not abort the
1247 /// remaining cookies.
1248 ///
1249 /// # Errors
1250 ///
1251 /// Returns [`BrowserError::Timeout`] if a single `Network.setCookie` CDP
1252 /// call exceeds `cdp_timeout`.
1253 ///
1254 /// # Example
1255 ///
1256 /// ```no_run
1257 /// use stygian_browser::{BrowserPool, BrowserConfig};
1258 /// use stygian_browser::session::SessionCookie;
1259 /// use std::time::Duration;
1260 ///
1261 /// # async fn run() -> stygian_browser::error::Result<()> {
1262 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1263 /// let handle = pool.acquire().await?;
1264 /// let page = handle.browser().expect("valid browser").new_page().await?;
1265 /// let cookies = vec![SessionCookie {
1266 /// name: "session".to_string(),
1267 /// value: "abc123".to_string(),
1268 /// domain: ".example.com".to_string(),
1269 /// path: "/".to_string(),
1270 /// expires: -1.0,
1271 /// http_only: true,
1272 /// secure: true,
1273 /// same_site: "Lax".to_string(),
1274 /// }];
1275 /// page.inject_cookies(&cookies).await?;
1276 /// # Ok(())
1277 /// # }
1278 /// ```
1279 pub async fn inject_cookies(&self, cookies: &[crate::session::SessionCookie]) -> Result<()> {
1280 use chromiumoxide::cdp::browser_protocol::network::SetCookieParams;
1281
1282 for cookie in cookies {
1283 let params = match SetCookieParams::builder()
1284 .name(cookie.name.clone())
1285 .value(cookie.value.clone())
1286 .domain(cookie.domain.clone())
1287 .path(cookie.path.clone())
1288 .http_only(cookie.http_only)
1289 .secure(cookie.secure)
1290 .build()
1291 {
1292 Ok(p) => p,
1293 Err(e) => {
1294 warn!(cookie = %cookie.name, error = %e, "Failed to build cookie params");
1295 continue;
1296 }
1297 };
1298
1299 match timeout(self.cdp_timeout, self.page.execute(params)).await {
1300 Err(_) => {
1301 warn!(
1302 cookie = %cookie.name,
1303 timeout_ms = self.cdp_timeout.as_millis(),
1304 "Timed out injecting cookie"
1305 );
1306 }
1307 Ok(Err(e)) => {
1308 warn!(cookie = %cookie.name, error = %e, "Failed to inject cookie");
1309 }
1310 Ok(Ok(_)) => {}
1311 }
1312 }
1313
1314 debug!(count = cookies.len(), "Cookies injected");
1315 Ok(())
1316 }
1317
1318 /// Capture a screenshot of the current page as PNG bytes.
1319 ///
1320 /// The screenshot is full-page by default (viewport clipped to the rendered
1321 /// layout area). Save the returned bytes to a `.png` file or process
1322 /// them in-memory.
1323 ///
1324 /// # Errors
1325 ///
1326 /// Returns [`BrowserError::CdpError`] if the CDP `Page.captureScreenshot`
1327 /// command fails, or [`BrowserError::Timeout`] if it exceeds
1328 /// `cdp_timeout`.
1329 ///
1330 /// # Example
1331 ///
1332 /// ```no_run
1333 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
1334 /// use std::{time::Duration, fs};
1335 ///
1336 /// # async fn run() -> stygian_browser::error::Result<()> {
1337 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1338 /// let handle = pool.acquire().await?;
1339 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1340 /// page.navigate("https://example.com", WaitUntil::Selector("body".to_string()), Duration::from_secs(30)).await?;
1341 /// let png = page.screenshot().await?;
1342 /// fs::write("screenshot.png", &png).unwrap();
1343 /// # Ok(())
1344 /// # }
1345 /// ```
1346 pub async fn screenshot(&self) -> Result<Vec<u8>> {
1347 use chromiumoxide::page::ScreenshotParams;
1348
1349 let params = ScreenshotParams::builder().full_page(true).build();
1350
1351 timeout(self.cdp_timeout, self.page.screenshot(params))
1352 .await
1353 .map_err(|_| BrowserError::Timeout {
1354 operation: "Page.captureScreenshot".to_string(),
1355 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1356 })?
1357 .map_err(|e| BrowserError::CdpError {
1358 operation: "Page.captureScreenshot".to_string(),
1359 message: e.to_string(),
1360 })
1361 }
1362
1363 /// Borrow the underlying chromiumoxide [`Page`].
1364 pub const fn inner(&self) -> &Page {
1365 &self.page
1366 }
1367
1368 /// Close this page (tab).
1369 ///
1370 /// Called automatically on drop; explicit call avoids suppressing the error.
1371 pub async fn close(self) -> Result<()> {
1372 timeout(Duration::from_secs(5), self.page.clone().close())
1373 .await
1374 .map_err(|_| BrowserError::Timeout {
1375 operation: "page.close".to_string(),
1376 duration_ms: 5000,
1377 })?
1378 .map_err(|e| BrowserError::CdpError {
1379 operation: "page.close".to_string(),
1380 message: e.to_string(),
1381 })
1382 }
1383}
1384
1385// ─── Stealth diagnostics ──────────────────────────────────────────────────────
1386
#[cfg(feature = "stealth")]
impl PageHandle {
    /// Run every built-in stealth detection check against the current page.
    ///
    /// Walks [`crate::diagnostic::all_checks`], evaluates each check's
    /// script through [`eval`](Self::eval), and aggregates the outcomes into
    /// a [`crate::diagnostic::DiagnosticReport`].
    ///
    /// A check whose script cannot be evaluated (JS exception, timeout,
    /// deserialisation error, ...) is recorded as a failing check with a
    /// `script error: ...` detail and does **not** abort the run.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`: every evaluation
    /// error is folded into the report as a failed check.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # async fn run() -> stygian_browser::error::Result<()> {
    /// use stygian_browser::{BrowserPool, BrowserConfig};
    /// use stygian_browser::page::WaitUntil;
    /// use std::time::Duration;
    ///
    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
    /// let handle = pool.acquire().await?;
    /// let browser = handle.browser().expect("valid browser");
    /// let mut page = browser.new_page().await?;
    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(10)).await?;
    ///
    /// let report = page.verify_stealth().await?;
    /// println!("Stealth: {}/{} checks passed", report.passed_count, report.checks.len());
    /// for failure in report.failures() {
    ///     eprintln!("  FAIL  {}: {}", failure.description, failure.details);
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub async fn verify_stealth(&self) -> Result<crate::diagnostic::DiagnosticReport> {
        use crate::diagnostic::{CheckResult, DiagnosticReport, all_checks};

        let mut outcomes: Vec<CheckResult> = Vec::new();

        for check in all_checks() {
            let evaluated = self.eval::<String>(check.script).await;

            let outcome = match evaluated {
                Err(e) => {
                    warn!(
                        check = ?check.id,
                        error = %e,
                        "stealth check script failed during evaluation"
                    );
                    // An unevaluable check counts as a failure, not an abort.
                    CheckResult {
                        id: check.id,
                        description: check.description.to_string(),
                        passed: false,
                        details: format!("script error: {e}"),
                    }
                }
                Ok(json) => check.parse_output(&json),
            };

            debug!(
                check = ?outcome.id,
                passed = outcome.passed,
                details = %outcome.details,
                "stealth check result"
            );
            outcomes.push(outcome);
        }

        Ok(DiagnosticReport::new(outcomes))
    }
}
1459
1460// ─── extract feature ─────────────────────────────────────────────────────────
1461
#[cfg(feature = "extract")]
impl PageHandle {
    /// Build one `T` from each element matching `selector`.
    ///
    /// Every matched element is handed to `T::extract_from` as the root
    /// node. When nothing matches the result is an empty `Vec`, not an
    /// error.
    ///
    /// The per-node extractions are awaited together through
    /// [`futures::future::try_join_all`], so the first failure ends the
    /// whole call.
    ///
    /// # Errors
    ///
    /// Propagates any error from
    /// [`query_selector_all`](Self::query_selector_all), and returns
    /// [`BrowserError::ExtractionFailed`] if any node's extraction fails.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use stygian_browser::extract::Extract;
    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
    /// use std::time::Duration;
    ///
    /// #[derive(Extract)]
    /// struct Link {
    ///     #[selector("a", attr = "href")]
    ///     href: Option<String>,
    /// }
    ///
    /// # async fn run() -> stygian_browser::error::Result<()> {
    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
    /// let handle = pool.acquire().await?;
    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
    /// page.navigate(
    ///     "https://example.com",
    ///     WaitUntil::DomContentLoaded,
    ///     Duration::from_secs(30),
    /// ).await?;
    /// let links: Vec<Link> = page.extract_all::<Link>("nav li").await?;
    /// # Ok(())
    /// # }
    /// ```
    pub async fn extract_all<T>(&self, selector: &str) -> Result<Vec<T>>
    where
        T: crate::extract::Extractable,
    {
        use futures::future::try_join_all;

        let roots = self.query_selector_all(selector).await?;
        let pending = roots.iter().map(|root| T::extract_from(root));

        match try_join_all(pending).await {
            Ok(items) => Ok(items),
            Err(cause) => Err(BrowserError::ExtractionFailed(cause)),
        }
    }
}
1517
1518// ─── similarity feature ──────────────────────────────────────────────────────
1519
#[cfg(feature = "similarity")]
impl NodeHandle {
    /// Compute the structural [`crate::similarity::ElementFingerprint`] of
    /// this node.
    ///
    /// A single JS function is invoked on the element. It collects the
    /// lower-cased tag name, the sorted class list, the sorted attribute
    /// names (excluding `class` and `id`) and the number of ancestors
    /// between the element and `<body>`, and returns them as one JSON
    /// string that is then deserialised on the Rust side.
    ///
    /// # Errors
    ///
    /// Returns [`BrowserError::Timeout`] if the call exceeds `cdp_timeout`,
    /// whatever `cdp_err_or_stale` maps a CDP failure to (e.g.
    /// [`BrowserError::StaleNode`] for an invalidated remote object), or
    /// [`BrowserError::ScriptExecutionFailed`] if the script yields no
    /// string or a string that is not a valid fingerprint.
    pub async fn fingerprint(&self) -> Result<crate::similarity::ElementFingerprint> {
        const JS: &str = r"function() {
    var el = this;
    var tag = el.tagName.toLowerCase();
    var classes = Array.prototype.slice.call(el.classList).sort();
    var attrNames = Array.prototype.slice.call(el.attributes)
        .map(function(a) { return a.name; })
        .filter(function(n) { return n !== 'class' && n !== 'id'; })
        .sort();
    var depth = 0;
    var n = el.parentElement;
    while (n && n.tagName.toLowerCase() !== 'body') { depth++; n = n.parentElement; }
    return JSON.stringify({ tag: tag, classes: classes, attrNames: attrNames, depth: depth });
}";

        let call = self.element.call_js_fn(JS, true);
        let returns = match timeout(self.cdp_timeout, call).await {
            Err(_) => {
                return Err(BrowserError::Timeout {
                    operation: "NodeHandle::fingerprint".to_string(),
                    duration_ms: u64::try_from(self.cdp_timeout.as_millis())
                        .unwrap_or(u64::MAX),
                });
            }
            Ok(Err(e)) => return Err(self.cdp_err_or_stale(&e, "fingerprint")),
            Ok(Ok(returns)) => returns,
        };

        // The script always stringifies its result, so anything other than a
        // JSON string here means the call did not behave as expected.
        let payload = match returns.result.value.as_ref().and_then(|v| v.as_str()) {
            Some(text) => text,
            None => {
                return Err(BrowserError::ScriptExecutionFailed {
                    script: "NodeHandle::fingerprint".to_string(),
                    reason: "CDP returned no string value from fingerprint script".to_string(),
                });
            }
        };

        match serde_json::from_str::<crate::similarity::ElementFingerprint>(payload) {
            Ok(fingerprint) => Ok(fingerprint),
            Err(e) => Err(BrowserError::ScriptExecutionFailed {
                script: "NodeHandle::fingerprint".to_string(),
                reason: format!("failed to deserialise fingerprint JSON: {e}"),
            }),
        }
    }
}
1574
#[cfg(feature = "similarity")]
impl PageHandle {
    /// Find the elements of the current page that are structurally similar
    /// to `reference`.
    ///
    /// Fingerprints `reference` (via [`NodeHandle::fingerprint`]), then
    /// fingerprints every element returned by `query_selector_all("*")`,
    /// one at a time, and keeps those whose
    /// [`crate::similarity::jaccard_weighted`] score is at least
    /// `config.threshold`. Candidates whose fingerprint cannot be computed
    /// are skipped. Results are ordered by score, highest first, and capped
    /// at `config.max_results` when that is non-zero (`0` means no cap).
    ///
    /// # Example
    ///
    /// ```no_run
    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
    /// use stygian_browser::similarity::SimilarityConfig;
    /// use std::time::Duration;
    ///
    /// # async fn run() -> stygian_browser::error::Result<()> {
    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
    /// let handle = pool.acquire().await?;
    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
    ///
    /// let nodes = page.query_selector_all(".price").await?;
    /// if let Some(reference) = nodes.into_iter().next() {
    ///     let similar = page.find_similar(&reference, SimilarityConfig::default()).await?;
    ///     for m in &similar {
    ///         println!("score={:.2}", m.score);
    ///     }
    /// }
    /// # Ok(())
    /// # }
    /// ```
    ///
    /// # Errors
    ///
    /// Propagates any error from fingerprinting `reference` (for example
    /// [`BrowserError::StaleNode`]) or from the candidate query.
    pub async fn find_similar(
        &self,
        reference: &NodeHandle,
        config: crate::similarity::SimilarityConfig,
    ) -> Result<Vec<crate::similarity::SimilarMatch>> {
        use crate::similarity::{SimilarMatch, jaccard_weighted};

        let ref_fp = reference.fingerprint().await?;

        let mut matches: Vec<SimilarMatch> = Vec::new();
        for node in self.query_selector_all("*").await? {
            // Stale / detached nodes are silently skipped.
            let cand_fp = match node.fingerprint().await {
                Ok(fp) => fp,
                Err(_) => continue,
            };

            let score = jaccard_weighted(&ref_fp, &cand_fp);
            if score >= config.threshold {
                matches.push(SimilarMatch { node, score });
            }
        }

        // Best score first; incomparable scores are treated as equal.
        matches.sort_by(|lhs, rhs| match rhs.score.partial_cmp(&lhs.score) {
            Some(order) => order,
            None => std::cmp::Ordering::Equal,
        });

        match config.max_results {
            0 => {}
            limit => matches.truncate(limit),
        }

        Ok(matches)
    }
}
1648
1649impl Drop for PageHandle {
1650 fn drop(&mut self) {
1651 warn!("PageHandle dropped without explicit close(); spawning cleanup task");
1652 // chromiumoxide Page does not implement close on Drop, so we spawn
1653 // a fire-and-forget task. The page ref is already owned; we need to
1654 // swap it out. We clone the Page handle (it's Arc-backed internally).
1655 let page = self.page.clone();
1656 tokio::spawn(async move {
1657 let _ = page.close().await;
1658 });
1659 }
1660}
1661
1662// ─── Tests ────────────────────────────────────────────────────────────────────
1663
#[cfg(test)]
mod tests {
    use super::*;

    /// `block_media` blocks the four media-ish resource types and nothing else
    /// that is probed here.
    #[test]
    fn resource_filter_block_media_blocks_image() {
        let filter = ResourceFilter::block_media();

        for &blocked in &["Image", "Font", "Stylesheet", "Media"] {
            assert!(filter.should_block(blocked), "{blocked} should be blocked");
        }
        for &allowed in &["Script", "XHR"] {
            assert!(!filter.should_block(allowed), "{allowed} should be allowed");
        }
    }

    /// Resource-type matching ignores ASCII case.
    #[test]
    fn resource_filter_case_insensitive() {
        let filter = ResourceFilter::block_images_and_fonts();

        for &spelling in &["image", "IMAGE"] {
            assert!(filter.should_block(spelling), "{spelling} should be blocked");
        }
        assert!(!filter.should_block("Stylesheet"));
    }

    /// Chained `block` calls accumulate.
    #[test]
    fn resource_filter_builder_chain() {
        let mut filter = ResourceFilter::default();
        filter = filter.block(ResourceType::Image);
        filter = filter.block(ResourceType::Font);

        assert!(filter.should_block("Image"));
        assert!(filter.should_block("Font"));
        assert!(!filter.should_block("Stylesheet"));
    }

    /// Blocking the same type twice stores it once.
    #[test]
    fn resource_filter_dedup_block() {
        let once = ResourceFilter::default().block(ResourceType::Image);
        let twice = once.block(ResourceType::Image);
        assert_eq!(twice.blocked.len(), 1);
    }

    /// The default filter is empty; a preset is not.
    #[test]
    fn resource_filter_is_empty_when_default() {
        let untouched = ResourceFilter::default();
        let preset = ResourceFilter::block_media();
        assert!(untouched.is_empty());
        assert!(!preset.is_empty());
    }

    /// `WaitUntil::Selector` keeps the selector text it was built with.
    #[test]
    fn wait_until_selector_stores_string() {
        let strategy = WaitUntil::Selector("#foo".to_string());
        assert!(matches!(strategy, WaitUntil::Selector(ref css) if css == "#foo"));
    }

    /// Each resource type maps to its CDP spelling.
    #[test]
    fn resource_type_cdp_str() {
        let expectations = [
            (ResourceType::Image, "Image"),
            (ResourceType::Font, "Font"),
            (ResourceType::Stylesheet, "Stylesheet"),
            (ResourceType::Media, "Media"),
        ];
        for (kind, expected) in &expectations {
            assert_eq!(kind.as_cdp_str(), *expected);
        }
    }

    /// `PageHandle` must be `Send + Sync` for use across thread boundaries.
    #[test]
    fn page_handle_is_send_sync() {
        fn require_send_and_sync<T: Send + Sync>() {}
        require_send_and_sync::<PageHandle>();
    }

    /// The status-code sentinel (0 = "not yet captured") and the conversion to
    /// `Option<u16>` are pure-logic invariants testable without a live browser.
    #[test]
    fn status_code_sentinel_zero_maps_to_none() {
        use std::sync::atomic::{AtomicU16, Ordering};

        let stored = AtomicU16::new(0).load(Ordering::Acquire);
        let mapped = if stored == 0 { None } else { Some(stored) };
        assert_eq!(mapped, None::<u16>);
    }

    /// Any non-zero stored value surfaces unchanged as `Some`.
    #[test]
    fn status_code_non_zero_maps_to_some() {
        use std::sync::atomic::{AtomicU16, Ordering};

        for &expected in &[200u16, 301, 404, 503] {
            let stored = AtomicU16::new(expected).load(Ordering::Acquire);
            let mapped = if stored == 0 { None } else { Some(stored) };
            assert_eq!(mapped, Some(expected));
        }
    }

    // ── NodeHandle pure-logic tests ───────────────────────────────────────────

    /// `attr_map` relies on `chunks_exact(2)` — verify the pairing logic is
    /// correct without a live browser by exercising it directly.
    #[test]
    fn attr_map_chunking_pairs_correctly() {
        let flat: Vec<String> = ["id", "main", "data-ux", "Section", "class", "container"]
            .iter()
            .map(|part| (*part).to_string())
            .collect();

        let mut paired = std::collections::HashMap::with_capacity(flat.len() / 2);
        for chunk in flat.chunks_exact(2) {
            if let [name, value] = chunk {
                paired.insert(name.clone(), value.clone());
            }
        }

        assert_eq!(paired.len(), 3);
        assert_eq!(paired.get("id").map(String::as_str), Some("main"));
        assert_eq!(paired.get("data-ux").map(String::as_str), Some("Section"));
        assert_eq!(paired.get("class").map(String::as_str), Some("container"));
    }

    /// Odd-length flat attribute lists (malformed CDP response) are handled
    /// gracefully — the trailing element is silently ignored.
    #[test]
    fn attr_map_chunking_ignores_odd_trailing() {
        // A lone name with no value.
        let flat = ["orphan".to_string()];

        let mut paired = std::collections::HashMap::new();
        for chunk in flat.chunks_exact(2) {
            if let [name, value] = chunk {
                paired.insert(name.clone(), value.clone());
            }
        }

        assert!(paired.is_empty());
    }

    /// Empty flat list → empty map.
    #[test]
    fn attr_map_chunking_empty_input() {
        let flat: Vec<String> = Vec::new();

        let paired: std::collections::HashMap<String, String> = flat
            .chunks_exact(2)
            .filter_map(|chunk| match chunk {
                [name, value] => Some((name.clone(), value.clone())),
                _ => None,
            })
            .collect();

        assert!(paired.is_empty());
    }

    /// `ancestors` JSON parsing: valid input round-trips correctly.
    #[test]
    fn ancestors_json_parse_round_trip() -> std::result::Result<(), serde_json::Error> {
        let parsed: Vec<String> = serde_json::from_str(r#"["p","article","body","html"]"#)?;
        assert_eq!(parsed, ["p", "article", "body", "html"]);
        Ok(())
    }

    /// `ancestors` JSON parsing: empty array (no parent) is fine.
    #[test]
    fn ancestors_json_parse_empty() -> std::result::Result<(), serde_json::Error> {
        let parsed: Vec<String> = serde_json::from_str("[]")?;
        assert!(parsed.is_empty());
        Ok(())
    }

    // ── Traversal selector suffix tests ──────────────────────────────────────

    /// A `StaleNode` error whose selector includes a traversal suffix (e.g.
    /// `"div::parent"`) must surface that suffix in its `Display` output so
    /// callers can locate the failed traversal in logs.
    #[test]
    fn traversal_selector_suffix_in_stale_error() {
        let stale = crate::error::BrowserError::StaleNode {
            selector: "div::parent".to_string(),
        };
        let msg = stale.to_string();
        assert!(
            msg.contains("div::parent"),
            "StaleNode display must include the full selector; got: {msg}"
        );
    }

    /// Same check for the `::next` suffix produced by `next_sibling()`.
    #[test]
    fn traversal_next_suffix_in_stale_error() {
        let stale = crate::error::BrowserError::StaleNode {
            selector: "li.price::next".to_string(),
        };
        let rendered = stale.to_string();
        assert!(rendered.contains("li.price::next"));
    }

    /// Same check for the `::prev` suffix produced by `previous_sibling()`.
    #[test]
    fn traversal_prev_suffix_in_stale_error() {
        let stale = crate::error::BrowserError::StaleNode {
            selector: "td.label::prev".to_string(),
        };
        let rendered = stale.to_string();
        assert!(rendered.contains("td.label::prev"));
    }
}