Skip to main content

stygian_browser/
page.rs

1//! Page and browsing context management for isolated, parallel scraping
2//!
3//! Each `BrowserContext` (future) is an incognito-style isolation boundary (separate
4//! cookies, localStorage, cache).  Each context can contain many [`PageHandle`]s
5//! (tabs).  Both types clean up their CDP resources automatically on drop.
6//!
7//! ## Resource blocking
8//!
9//! Pass a [`ResourceFilter`] to [`PageHandle::set_resource_filter`] to intercept
10//! and block specific request types (images, fonts, CSS) before page load —
11//! significantly reducing page load times for text-only scraping.
12//!
13//! ## Wait strategies
14//!
15//! [`PageHandle`] exposes three wait strategies via [`WaitUntil`]:
16//! - `DomContentLoaded` — fires when the HTML is parsed
17//! - `NetworkIdle` — fires when there are ≤2 in-flight requests for 500 ms
18//! - `Selector(css)` — fires when a CSS selector matches an element
19//!
20//! # Example
21//!
22//! ```no_run
23//! use stygian_browser::{BrowserPool, BrowserConfig};
24//! use stygian_browser::page::{ResourceFilter, WaitUntil};
25//! use std::time::Duration;
26//!
27//! # async fn run() -> stygian_browser::error::Result<()> {
28//! let pool = BrowserPool::new(BrowserConfig::default()).await?;
29//! let handle = pool.acquire().await?;
30//!
31//! let mut page = handle.browser().expect("valid browser").new_page().await?;
32//! page.set_resource_filter(ResourceFilter::block_media()).await?;
33//! page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
34//! let title = page.title().await?;
35//! println!("title: {title}");
36//! handle.release().await;
37//! # Ok(())
38//! # }
39//! ```
40
41use std::collections::HashMap;
42use std::sync::{
43    Arc,
44    atomic::{AtomicU16, Ordering},
45};
46use std::time::Duration;
47
48use chromiumoxide::Page;
49use tokio::time::timeout;
50use tracing::{debug, warn};
51
52use crate::error::{BrowserError, Result};
53
54// ─── ResourceType ─────────────────────────────────────────────────────────────
55
/// CDP resource types that can be intercepted.
///
/// The enum is a plain tag with no payload, so it is `Copy` and `Hash` and can
/// be passed by value or stored in hash-based collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ResourceType {
    /// `<img>`, `<picture>`, background images
    Image,
    /// Web fonts loaded via CSS `@font-face`
    Font,
    /// External CSS stylesheets
    Stylesheet,
    /// Media files (audio/video)
    Media,
}

impl ResourceType {
    /// Returns the CDP `Network.ResourceType` name for this variant.
    ///
    /// This is the string to compare against the resource type reported by
    /// CDP request-interception events (for example `Fetch.requestPaused`).
    pub const fn as_cdp_str(&self) -> &'static str {
        match self {
            Self::Image => "Image",
            Self::Font => "Font",
            Self::Stylesheet => "Stylesheet",
            Self::Media => "Media",
        }
    }
}
80
81// ─── ResourceFilter ───────────────────────────────────────────────────────────
82
83/// Set of resource types to block from loading.
84///
85/// # Example
86///
87/// ```
88/// use stygian_browser::page::ResourceFilter;
89/// let filter = ResourceFilter::block_media();
90/// assert!(filter.should_block("Image"));
91/// ```
92#[derive(Debug, Clone, Default)]
93pub struct ResourceFilter {
94    blocked: Vec<ResourceType>,
95}
96
97impl ResourceFilter {
98    /// Block all media resources (images, fonts, CSS, audio/video).
99    pub fn block_media() -> Self {
100        Self {
101            blocked: vec![
102                ResourceType::Image,
103                ResourceType::Font,
104                ResourceType::Stylesheet,
105                ResourceType::Media,
106            ],
107        }
108    }
109
110    /// Block only images and fonts (keep styles for layout-sensitive work).
111    pub fn block_images_and_fonts() -> Self {
112        Self {
113            blocked: vec![ResourceType::Image, ResourceType::Font],
114        }
115    }
116
117    /// Add a resource type to the block list.
118    #[must_use]
119    pub fn block(mut self, resource: ResourceType) -> Self {
120        if !self.blocked.contains(&resource) {
121            self.blocked.push(resource);
122        }
123        self
124    }
125
126    /// Returns `true` if the given CDP resource type string should be blocked.
127    pub fn should_block(&self, cdp_type: &str) -> bool {
128        self.blocked
129            .iter()
130            .any(|r| r.as_cdp_str().eq_ignore_ascii_case(cdp_type))
131    }
132
133    /// Returns `true` if no resource types are blocked.
134    pub const fn is_empty(&self) -> bool {
135        self.blocked.is_empty()
136    }
137}
138
139// ─── WaitUntil ────────────────────────────────────────────────────────────────
140
/// Condition to wait for after a navigation.
///
/// Values can be compared and hashed, which makes it easy to assert on or
/// deduplicate wait conditions.
///
/// # Example
///
/// ```
/// use stygian_browser::page::WaitUntil;
/// let w = WaitUntil::Selector("#main".to_string());
/// assert!(matches!(w, WaitUntil::Selector(_)));
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum WaitUntil {
    /// Wait for the `Page.domContentEventFired` CDP event — fires when the HTML
    /// document has been fully parsed and the DOM is ready, before subresources
    /// such as images and stylesheets finish loading.
    DomContentLoaded,
    /// Wait for the `Page.loadEventFired` CDP event **and** then wait until no
    /// more than 2 network requests are in-flight for at least 500 ms
    /// (equivalent to Puppeteer's `networkidle2`).
    NetworkIdle,
    /// Wait until `document.querySelector(selector)` returns a non-null element.
    Selector(String),
}
163
164// ─── NodeHandle ───────────────────────────────────────────────────────────────
165
166/// A handle to a live DOM node backed by a CDP `RemoteObjectId`.
167///
168/// Obtained via [`PageHandle::query_selector_all`].  Each method issues one or
169/// more CDP `Runtime.callFunctionOn` calls against the held V8 remote object
170/// reference — no HTML serialisation occurs.
171///
172/// A handle becomes **stale** after page navigation or if the underlying DOM
173/// node is removed.  Stale calls return [`BrowserError::StaleNode`] so callers
174/// can distinguish them from other CDP failures.
175///
176/// # Example
177///
178/// ```no_run
179/// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
180/// use std::time::Duration;
181///
182/// # async fn run() -> stygian_browser::error::Result<()> {
183/// let pool = BrowserPool::new(BrowserConfig::default()).await?;
184/// let handle = pool.acquire().await?;
185/// let mut page = handle.browser().expect("valid browser").new_page().await?;
186/// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
187///
188/// for node in page.query_selector_all("a[href]").await? {
189///     let href = node.attr("href").await?;
190///     let text = node.text_content().await?;
191///     println!("{text}: {href:?}");
192/// }
193/// # Ok(())
194/// # }
195/// ```
196pub struct NodeHandle {
197    element: chromiumoxide::element::Element,
198    /// Original CSS selector — preserved for stale-node error messages only.
199    /// Shared via `Arc<str>` so all handles from a single query reuse the
200    /// same allocation rather than cloning a `String` per node.
201    selector: Arc<str>,
202    cdp_timeout: Duration,
203    /// Cloned page reference used only for document-level element resolution
204    /// during DOM traversal (parent / sibling navigation).
205    page: chromiumoxide::Page,
206}
207
208impl NodeHandle {
209    /// Return a single attribute value, or `None` if the attribute is absent.
210    ///
211    /// Issues one `Runtime.callFunctionOn` CDP call (`el.getAttribute(name)`).
212    ///
213    /// # Errors
214    ///
215    /// Returns [`BrowserError::StaleNode`] when the remote object has been
216    /// invalidated, or [`BrowserError::Timeout`] / [`BrowserError::CdpError`]
217    /// on transport-level failures.
218    pub async fn attr(&self, name: &str) -> Result<Option<String>> {
219        timeout(self.cdp_timeout, self.element.attribute(name))
220            .await
221            .map_err(|_| BrowserError::Timeout {
222                operation: "NodeHandle::attr".to_string(),
223                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
224            })?
225            .map_err(|e| self.cdp_err_or_stale(&e, "attr"))
226    }
227
228    /// Return all attributes as a `HashMap<name, value>` in a **single**
229    /// CDP round-trip.
230    ///
231    /// Uses `DOM.getAttributes` (via the chromiumoxide `attributes()` API)
232    /// which returns a flat `[name, value, name, value, …]` list from the node
233    /// description — no per-attribute calls are needed.
234    ///
235    /// # Errors
236    ///
237    /// Returns [`BrowserError::StaleNode`] when the remote object has been
238    /// invalidated.
239    pub async fn attr_map(&self) -> Result<HashMap<String, String>> {
240        let flat = timeout(self.cdp_timeout, self.element.attributes())
241            .await
242            .map_err(|_| BrowserError::Timeout {
243                operation: "NodeHandle::attr_map".to_string(),
244                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
245            })?
246            .map_err(|e| self.cdp_err_or_stale(&e, "attr_map"))?;
247
248        let mut map = HashMap::with_capacity(flat.len() / 2);
249        for pair in flat.chunks_exact(2) {
250            if let [name, value] = pair {
251                map.insert(name.clone(), value.clone());
252            }
253        }
254        Ok(map)
255    }
256
257    /// Return the element's `textContent` (all text inside, no markup).
258    ///
259    /// Reads the DOM `textContent` property via a single JS eval — this is the
260    /// raw text concatenation of all descendant text nodes, independent of
261    /// layout or visibility (unlike `innerText`).
262    ///
263    /// Returns an empty string when the property is absent or null.
264    ///
265    /// # Errors
266    ///
267    /// Returns [`BrowserError::StaleNode`] when the remote object has been
268    /// invalidated.
269    pub async fn text_content(&self) -> Result<String> {
270        let returns = timeout(
271            self.cdp_timeout,
272            self.element
273                .call_js_fn(r"function() { return this.textContent ?? ''; }", true),
274        )
275        .await
276        .map_err(|_| BrowserError::Timeout {
277            operation: "NodeHandle::text_content".to_string(),
278            duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
279        })?
280        .map_err(|e| self.cdp_err_or_stale(&e, "text_content"))?;
281
282        Ok(returns
283            .result
284            .value
285            .as_ref()
286            .and_then(|v| v.as_str())
287            .unwrap_or("")
288            .to_string())
289    }
290
291    /// Return the element's `innerHTML`.
292    ///
293    /// Returns an empty string when the property is absent or null.
294    ///
295    /// # Errors
296    ///
297    /// Returns [`BrowserError::StaleNode`] when the remote object has been
298    /// invalidated.
299    pub async fn inner_html(&self) -> Result<String> {
300        timeout(self.cdp_timeout, self.element.inner_html())
301            .await
302            .map_err(|_| BrowserError::Timeout {
303                operation: "NodeHandle::inner_html".to_string(),
304                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
305            })?
306            .map_err(|e| self.cdp_err_or_stale(&e, "inner_html"))
307            .map(Option::unwrap_or_default)
308    }
309
310    /// Return the element's `outerHTML`.
311    ///
312    /// Returns an empty string when the property is absent or null.
313    ///
314    /// # Errors
315    ///
316    /// Returns [`BrowserError::StaleNode`] when the remote object has been
317    /// invalidated.
318    pub async fn outer_html(&self) -> Result<String> {
319        timeout(self.cdp_timeout, self.element.outer_html())
320            .await
321            .map_err(|_| BrowserError::Timeout {
322                operation: "NodeHandle::outer_html".to_string(),
323                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
324            })?
325            .map_err(|e| self.cdp_err_or_stale(&e, "outer_html"))
326            .map(Option::unwrap_or_default)
327    }
328
329    /// Return the ancestor tag-name chain, root-last.
330    ///
331    /// Executes a single `Runtime.callFunctionOn` JavaScript function that
332    /// walks `parentElement` and collects tag names — no repeated CDP calls.
333    ///
334    /// ```text
335    /// // for <span> inside <p> inside <article> inside <body> inside <html>
336    /// ["p", "article", "body", "html"]
337    /// ```
338    ///
339    /// # Errors
340    ///
341    /// Returns [`BrowserError::StaleNode`] when the remote object has been
342    /// invalidated, or [`BrowserError::ScriptExecutionFailed`] when CDP
343    /// returns no value or the value is not a string array.
344    pub async fn ancestors(&self) -> Result<Vec<String>> {
345        let returns = timeout(
346            self.cdp_timeout,
347            self.element.call_js_fn(
348                r"function() {
349                    const a = [];
350                    let n = this.parentElement;
351                    while (n) { a.push(n.tagName.toLowerCase()); n = n.parentElement; }
352                    return a;
353                }",
354                true,
355            ),
356        )
357        .await
358        .map_err(|_| BrowserError::Timeout {
359            operation: "NodeHandle::ancestors".to_string(),
360            duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
361        })?
362        .map_err(|e| self.cdp_err_or_stale(&e, "ancestors"))?;
363
364        // With returnByValue=true and an array return, CDP delivers the value
365        // as a JSON array directly — no JSON.stringify/re-parse needed.
366        // A missing or wrong-type value indicates an unexpected CDP failure.
367        let arr = returns
368            .result
369            .value
370            .as_ref()
371            .and_then(|v| v.as_array())
372            .ok_or_else(|| BrowserError::ScriptExecutionFailed {
373                script: "NodeHandle::ancestors".to_string(),
374                reason: "CDP returned no value or a non-array value for ancestors()".to_string(),
375            })?;
376
377        arr.iter()
378            .map(|v| {
379                v.as_str().map(ToString::to_string).ok_or_else(|| {
380                    BrowserError::ScriptExecutionFailed {
381                        script: "NodeHandle::ancestors".to_string(),
382                        reason: format!("ancestor entry is not a string: {v}"),
383                    }
384                })
385            })
386            .collect()
387    }
388
389    /// Return child elements matching `selector` as new [`NodeHandle`]s.
390    ///
391    /// Issues a single `Runtime.callFunctionOn` + `DOM.querySelectorAll`
392    /// call scoped to this element — not to the entire document.
393    ///
394    /// Returns an empty `Vec` when no children match (consistent with the JS
395    /// `querySelectorAll` contract).
396    ///
397    /// # Errors
398    ///
399    /// Returns [`BrowserError::StaleNode`] when the remote object has been
400    /// invalidated, or [`BrowserError::CdpError`] on transport failure.
401    pub async fn children_matching(&self, selector: &str) -> Result<Vec<Self>> {
402        let elements = timeout(self.cdp_timeout, self.element.find_elements(selector))
403            .await
404            .map_err(|_| BrowserError::Timeout {
405                operation: "NodeHandle::children_matching".to_string(),
406                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
407            })?
408            .map_err(|e| self.cdp_err_or_stale(&e, "children_matching"))?;
409
410        let selector_arc: Arc<str> = Arc::from(selector);
411        Ok(elements
412            .into_iter()
413            .map(|el| Self {
414                element: el,
415                selector: selector_arc.clone(),
416                cdp_timeout: self.cdp_timeout,
417                page: self.page.clone(),
418            })
419            .collect())
420    }
421
422    /// Return the immediate parent element, or `None` if this element has no
423    /// parent (i.e. it is the document root).
424    ///
425    /// Issues a single `Runtime.callFunctionOn` CDP call that temporarily tags
426    /// the parent element with a unique attribute, then resolves it via a
427    /// document-level `DOM.querySelector` before removing the tag.
428    ///
429    /// # Errors
430    ///
431    /// Returns [`BrowserError::StaleNode`] when the remote object has been
432    /// invalidated.
433    ///
434    /// # Example
435    ///
436    /// ```no_run
437    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
438    /// use std::time::Duration;
439    ///
440    /// # async fn run() -> stygian_browser::error::Result<()> {
441    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
442    /// let handle = pool.acquire().await?;
443    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
444    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
445    /// let nodes = page.query_selector_all("p").await?;
446    /// if let Some(parent) = nodes[0].parent().await? {
447    ///     let html = parent.outer_html().await?;
448    ///     println!("parent: {}", &html[..html.len().min(80)]);
449    /// }
450    /// # Ok(())
451    /// # }
452    /// ```
453    pub async fn parent(&self) -> Result<Option<Self>> {
454        let attr = format!(
455            "data-stygian-t-{}",
456            ulid::Ulid::new().to_string().to_lowercase()
457        );
458        let js = format!(
459            "function() {{ \
460                var t = this.parentElement; \
461                if (!t) {{ return false; }} \
462                t.setAttribute('{attr}', '1'); \
463                return true; \
464            }}"
465        );
466        self.call_traversal(&js, &attr, "parent").await
467    }
468
469    /// Return the next element sibling, or `None` if this element is the last
470    /// child of its parent.
471    ///
472    /// Uses `nextElementSibling` (skips text/comment nodes).
473    ///
474    /// # Errors
475    ///
476    /// Returns [`BrowserError::StaleNode`] when the remote object has been
477    /// invalidated.
478    ///
479    /// # Example
480    ///
481    /// ```no_run
482    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
483    /// use std::time::Duration;
484    ///
485    /// # async fn run() -> stygian_browser::error::Result<()> {
486    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
487    /// let handle = pool.acquire().await?;
488    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
489    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
490    /// let nodes = page.query_selector_all("li").await?;
491    /// if let Some(next) = nodes[0].next_sibling().await? {
492    ///     println!("next sibling: {}", next.text_content().await?);
493    /// }
494    /// # Ok(())
495    /// # }
496    /// ```
497    pub async fn next_sibling(&self) -> Result<Option<Self>> {
498        let attr = format!(
499            "data-stygian-t-{}",
500            ulid::Ulid::new().to_string().to_lowercase()
501        );
502        let js = format!(
503            "function() {{ \
504                var t = this.nextElementSibling; \
505                if (!t) {{ return false; }} \
506                t.setAttribute('{attr}', '1'); \
507                return true; \
508            }}"
509        );
510        self.call_traversal(&js, &attr, "next").await
511    }
512
513    /// Return the previous element sibling, or `None` if this element is the
514    /// first child of its parent.
515    ///
516    /// Uses `previousElementSibling` (skips text/comment nodes).
517    ///
518    /// # Errors
519    ///
520    /// Returns [`BrowserError::StaleNode`] when the remote object has been
521    /// invalidated.
522    ///
523    /// # Example
524    ///
525    /// ```no_run
526    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
527    /// use std::time::Duration;
528    ///
529    /// # async fn run() -> stygian_browser::error::Result<()> {
530    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
531    /// let handle = pool.acquire().await?;
532    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
533    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
534    /// let nodes = page.query_selector_all("li").await?;
535    /// if let Some(prev) = nodes[1].previous_sibling().await? {
536    ///     println!("prev sibling: {}", prev.text_content().await?);
537    /// }
538    /// # Ok(())
539    /// # }
540    /// ```
541    pub async fn previous_sibling(&self) -> Result<Option<Self>> {
542        let attr = format!(
543            "data-stygian-t-{}",
544            ulid::Ulid::new().to_string().to_lowercase()
545        );
546        let js = format!(
547            "function() {{ \
548                var t = this.previousElementSibling; \
549                if (!t) {{ return false; }} \
550                t.setAttribute('{attr}', '1'); \
551                return true; \
552            }}"
553        );
554        self.call_traversal(&js, &attr, "prev").await
555    }
556
557    /// Shared traversal implementation used by [`parent`], [`next_sibling`],
558    /// and [`previous_sibling`].
559    ///
560    /// The caller provides a JS function that:
561    /// 1. Navigates to the target element (parent / sibling).
562    /// 2. If the target is non-null, sets a unique attribute (`attr_name`)
563    ///    on it and returns `true`.
564    /// 3. Returns `false` when the target is null (no such neighbour).
565    ///
566    /// This helper then resolves the tagged element from the document root,
567    /// removes the temporary attribute, and wraps the result in a
568    /// `NodeHandle`.
569    ///
570    /// [`parent`]: Self::parent
571    /// [`next_sibling`]: Self::next_sibling
572    /// [`previous_sibling`]: Self::previous_sibling
573    async fn call_traversal(
574        &self,
575        js_fn: &str,
576        attr_name: &str,
577        selector_suffix: &str,
578    ) -> Result<Option<Self>> {
579        // Step 1: Run the JS that tags the target element and reports null/non-null.
580        let op_tag = format!("NodeHandle::{selector_suffix}::tag");
581        let returns = timeout(self.cdp_timeout, self.element.call_js_fn(js_fn, false))
582            .await
583            .map_err(|_| BrowserError::Timeout {
584                operation: op_tag.clone(),
585                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
586            })?
587            .map_err(|e| self.cdp_err_or_stale(&e, selector_suffix))?;
588
589        // JS returns false → no such neighbour.
590        let has_target = returns
591            .result
592            .value
593            .as_ref()
594            .and_then(serde_json::Value::as_bool)
595            .unwrap_or(false);
596        if !has_target {
597            return Ok(None);
598        }
599
600        // Step 2: Resolve the tagged element via a document-level querySelector.
601        let css = format!("[{attr_name}]");
602        let op_resolve = format!("NodeHandle::{selector_suffix}::resolve");
603        let element = timeout(self.cdp_timeout, self.page.find_element(css))
604            .await
605            .map_err(|_| BrowserError::Timeout {
606                operation: op_resolve.clone(),
607                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
608            })?
609            .map_err(|e| BrowserError::CdpError {
610                operation: op_resolve,
611                message: e.to_string(),
612            })?;
613
614        // Step 3: Remove the temporary attribute (best-effort; a failure here
615        // is non-fatal — it leaves a harmless stale attribute in the DOM).
616        let cleanup = format!("function() {{ this.removeAttribute('{attr_name}'); }}");
617        let _ = element.call_js_fn(cleanup, false).await;
618
619        // Step 4: Wrap in a NodeHandle with the diagnostic selector suffix.
620        let new_selector: Arc<str> =
621            Arc::from(format!("{}::{selector_suffix}", self.selector).as_str());
622        Ok(Some(Self {
623            element,
624            selector: new_selector,
625            cdp_timeout: self.cdp_timeout,
626            page: self.page.clone(),
627        }))
628    }
629
630    /// Map a chromiumoxide `CdpError` to either [`BrowserError::StaleNode`]
631    /// (when the remote object reference has been invalidated) or
632    /// [`BrowserError::CdpError`] for all other failures.
633    fn cdp_err_or_stale(
634        &self,
635        err: &chromiumoxide::error::CdpError,
636        operation: &str,
637    ) -> BrowserError {
638        let msg = err.to_string();
639        if msg.contains("Cannot find object with id")
640            || msg.contains("context with specified id")
641            || msg.contains("Cannot find context")
642        {
643            BrowserError::StaleNode {
644                selector: self.selector.to_string(),
645            }
646        } else {
647            BrowserError::CdpError {
648                operation: operation.to_string(),
649                message: msg,
650            }
651        }
652    }
653}
654
655// ─── PageHandle ───────────────────────────────────────────────────────────────
656
657/// A handle to an open browser tab.
658///
659/// On drop the underlying page is closed automatically.
660///
661/// # Example
662///
663/// ```no_run
664/// use stygian_browser::{BrowserPool, BrowserConfig};
665/// use stygian_browser::page::WaitUntil;
666/// use std::time::Duration;
667///
668/// # async fn run() -> stygian_browser::error::Result<()> {
669/// let pool = BrowserPool::new(BrowserConfig::default()).await?;
670/// let handle = pool.acquire().await?;
671/// let mut page = handle.browser().expect("valid browser").new_page().await?;
672/// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
673/// let html = page.content().await?;
674/// drop(page); // closes the tab
675/// handle.release().await;
676/// # Ok(())
677/// # }
678/// ```
679pub struct PageHandle {
680    page: Page,
681    cdp_timeout: Duration,
682    /// HTTP status code of the most recent main-frame navigation, or `0` if not
683    /// yet captured.  Written atomically by the listener spawned in `navigate()`.
684    last_status_code: Arc<AtomicU16>,
685    /// Background task processing `Fetch.requestPaused` events. Aborted and
686    /// replaced each time `set_resource_filter` is called.
687    resource_filter_task: Option<tokio::task::JoinHandle<()>>,
688}
689
690impl PageHandle {
691    /// Wrap a raw chromiumoxide [`Page`] in a handle.
692    pub(crate) fn new(page: Page, cdp_timeout: Duration) -> Self {
693        Self {
694            page,
695            cdp_timeout,
696            last_status_code: Arc::new(AtomicU16::new(0)),
697            resource_filter_task: None,
698        }
699    }
700
701    /// Navigate to `url` and wait for `condition` within `nav_timeout`.
702    ///
703    /// # Errors
704    ///
705    /// Returns [`BrowserError::NavigationFailed`] if the navigation times out or
706    /// the CDP call fails.
707    pub async fn navigate(
708        &mut self,
709        url: &str,
710        condition: WaitUntil,
711        nav_timeout: Duration,
712    ) -> Result<()> {
713        self.setup_status_capture().await;
714        timeout(
715            nav_timeout,
716            self.navigate_inner(url, condition, nav_timeout),
717        )
718        .await
719        .map_err(|_| BrowserError::NavigationFailed {
720            url: url.to_string(),
721            reason: format!("navigation timed out after {nav_timeout:?}"),
722        })?
723    }
724
725    /// Reset the last status code and wire up the `Network.responseReceived`
726    /// listener before any navigation starts.  Errors are logged and swallowed
727    /// so that a missing network domain never blocks navigation.
728    async fn setup_status_capture(&self) {
729        use chromiumoxide::cdp::browser_protocol::network::{
730            EventResponseReceived, ResourceType as NetworkResourceType,
731        };
732        use futures::StreamExt;
733
734        // Reset so a stale code is not returned if the new navigation fails
735        // before the response headers arrive.
736        self.last_status_code.store(0, Ordering::Release);
737
738        // Subscribe *before* goto() — the listener runs in a detached task and
739        // stores the first Document-type response status atomically.
740        let page_for_listener = self.page.clone();
741        let status_capture = Arc::clone(&self.last_status_code);
742        match page_for_listener
743            .event_listener::<EventResponseReceived>()
744            .await
745        {
746            Ok(mut stream) => {
747                tokio::spawn(async move {
748                    while let Some(event) = stream.next().await {
749                        if event.r#type == NetworkResourceType::Document {
750                            let code = u16::try_from(event.response.status).unwrap_or(0);
751                            if code > 0 {
752                                status_capture.store(code, Ordering::Release);
753                            }
754                            break;
755                        }
756                    }
757                });
758            }
759            Err(e) => warn!("status-code capture unavailable: {e}"),
760        }
761    }
762
763    /// Subscribe to the appropriate CDP events, fire `goto`, then await
764    /// `condition`.  All subscriptions precede `goto` to eliminate the race
765    /// described in issue #7.
766    async fn navigate_inner(
767        &self,
768        url: &str,
769        condition: WaitUntil,
770        nav_timeout: Duration,
771    ) -> Result<()> {
772        use chromiumoxide::cdp::browser_protocol::page::{
773            EventDomContentEventFired, EventLoadEventFired,
774        };
775        use futures::StreamExt;
776
777        let url_owned = url.to_string();
778
779        let mut dom_events = match &condition {
780            WaitUntil::DomContentLoaded => Some(
781                self.page
782                    .event_listener::<EventDomContentEventFired>()
783                    .await
784                    .map_err(|e| BrowserError::NavigationFailed {
785                        url: url_owned.clone(),
786                        reason: e.to_string(),
787                    })?,
788            ),
789            _ => None,
790        };
791
792        let mut load_events = match &condition {
793            WaitUntil::NetworkIdle => Some(
794                self.page
795                    .event_listener::<EventLoadEventFired>()
796                    .await
797                    .map_err(|e| BrowserError::NavigationFailed {
798                        url: url_owned.clone(),
799                        reason: e.to_string(),
800                    })?,
801            ),
802            _ => None,
803        };
804
805        let inflight = if matches!(condition, WaitUntil::NetworkIdle) {
806            Some(self.subscribe_inflight_counter().await)
807        } else {
808            None
809        };
810
811        self.page
812            .goto(url)
813            .await
814            .map_err(|e| BrowserError::NavigationFailed {
815                url: url_owned.clone(),
816                reason: e.to_string(),
817            })?;
818
819        match &condition {
820            WaitUntil::DomContentLoaded => {
821                if let Some(ref mut events) = dom_events {
822                    let _ = events.next().await;
823                }
824            }
825            WaitUntil::NetworkIdle => {
826                if let Some(ref mut events) = load_events {
827                    let _ = events.next().await;
828                }
829                if let Some(ref counter) = inflight {
830                    Self::wait_network_idle(counter).await;
831                }
832            }
833            WaitUntil::Selector(css) => {
834                self.wait_for_selector(css, nav_timeout).await?;
835            }
836        }
837        Ok(())
838    }
839
840    /// Spawn three detached tasks that maintain a signed in-flight request
841    /// counter via `Network.requestWillBeSent` (+1) and
842    /// `Network.loadingFinished`/`Network.loadingFailed` (−1 each).
843    /// Returns the shared counter so the caller can poll it.
844    async fn subscribe_inflight_counter(&self) -> Arc<std::sync::atomic::AtomicI32> {
845        use std::sync::atomic::AtomicI32;
846
847        use chromiumoxide::cdp::browser_protocol::network::{
848            EventLoadingFailed, EventLoadingFinished, EventRequestWillBeSent,
849        };
850        use futures::StreamExt;
851
852        let counter: Arc<AtomicI32> = Arc::new(AtomicI32::new(0));
853        let pairs: [(Arc<AtomicI32>, i32); 3] = [
854            (Arc::clone(&counter), 1),
855            (Arc::clone(&counter), -1),
856            (Arc::clone(&counter), -1),
857        ];
858        let [p1, p2, p3] = [self.page.clone(), self.page.clone(), self.page.clone()];
859
860        macro_rules! spawn_tracker {
861            ($page:expr, $event:ty, $c:expr, $delta:expr) => {
862                match $page.event_listener::<$event>().await {
863                    Ok(mut s) => {
864                        let c = $c;
865                        let d = $delta;
866                        tokio::spawn(async move {
867                            while s.next().await.is_some() {
868                                c.fetch_add(d, Ordering::Relaxed);
869                            }
870                        });
871                    }
872                    Err(e) => warn!("network-idle tracker unavailable: {e}"),
873                }
874            };
875        }
876
877        let [(c1, d1), (c2, d2), (c3, d3)] = pairs;
878        spawn_tracker!(p1, EventRequestWillBeSent, c1, d1);
879        spawn_tracker!(p2, EventLoadingFinished, c2, d2);
880        spawn_tracker!(p3, EventLoadingFailed, c3, d3);
881
882        counter
883    }
884
885    /// Poll `counter` until ≤ 2 in-flight requests persist for 500 ms
886    /// (equivalent to Playwright's `networkidle2`).
887    async fn wait_network_idle(counter: &Arc<std::sync::atomic::AtomicI32>) {
888        const IDLE_THRESHOLD: i32 = 2;
889        const SETTLE: Duration = Duration::from_millis(500);
890        loop {
891            if counter.load(Ordering::Relaxed) <= IDLE_THRESHOLD {
892                tokio::time::sleep(SETTLE).await;
893                if counter.load(Ordering::Relaxed) <= IDLE_THRESHOLD {
894                    break;
895                }
896            } else {
897                tokio::time::sleep(Duration::from_millis(50)).await;
898            }
899        }
900    }
901
902    /// Wait until `document.querySelector(selector)` is non-null (`timeout`).
903    ///
904    /// # Errors
905    ///
906    /// Returns [`BrowserError::NavigationFailed`] if the selector is not found
907    /// within the given timeout.
908    pub async fn wait_for_selector(&self, selector: &str, wait_timeout: Duration) -> Result<()> {
909        let selector_owned = selector.to_string();
910        let poll = async {
911            loop {
912                if self.page.find_element(selector_owned.clone()).await.is_ok() {
913                    return Ok(());
914                }
915                tokio::time::sleep(Duration::from_millis(100)).await;
916            }
917        };
918
919        timeout(wait_timeout, poll)
920            .await
921            .map_err(|_| BrowserError::NavigationFailed {
922                url: String::new(),
923                reason: format!("selector '{selector_owned}' not found within {wait_timeout:?}"),
924            })?
925    }
926
927    /// Set a resource filter to block specific network request types.
928    ///
929    /// Enables `Fetch` interception and spawns a background task that continues
930    /// allowed requests and fails blocked ones with `BlockedByClient`. Any
931    /// previously set filter task is cancelled first.
932    ///
933    /// # Errors
934    ///
935    /// Returns a [`BrowserError::CdpError`] if the CDP call fails.
936    pub async fn set_resource_filter(&mut self, filter: ResourceFilter) -> Result<()> {
937        use chromiumoxide::cdp::browser_protocol::fetch::{
938            ContinueRequestParams, EnableParams, EventRequestPaused, FailRequestParams,
939            RequestPattern,
940        };
941        use chromiumoxide::cdp::browser_protocol::network::ErrorReason;
942        use futures::StreamExt as _;
943
944        if filter.is_empty() {
945            return Ok(());
946        }
947
948        // Cancel any previously running filter task.
949        if let Some(task) = self.resource_filter_task.take() {
950            task.abort();
951        }
952
953        let pattern = RequestPattern::builder().url_pattern("*").build();
954        let params = EnableParams::builder()
955            .patterns(vec![pattern])
956            .handle_auth_requests(false)
957            .build();
958
959        timeout(self.cdp_timeout, self.page.execute::<EnableParams>(params))
960            .await
961            .map_err(|_| BrowserError::Timeout {
962                operation: "Fetch.enable".to_string(),
963                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
964            })?
965            .map_err(|e| BrowserError::CdpError {
966                operation: "Fetch.enable".to_string(),
967                message: e.to_string(),
968            })?;
969
970        // Subscribe to requestPaused events and dispatch each one so navigation
971        // is never blocked. Without this handler Chrome holds every intercepted
972        // request indefinitely and the page hangs.
973        let mut events = self
974            .page
975            .event_listener::<EventRequestPaused>()
976            .await
977            .map_err(|e| BrowserError::CdpError {
978                operation: "Fetch.requestPaused subscribe".to_string(),
979                message: e.to_string(),
980            })?;
981
982        let page = self.page.clone();
983        debug!("Resource filter active: {:?}", filter);
984        let task = tokio::spawn(async move {
985            while let Some(event) = events.next().await {
986                let request_id = event.request_id.clone();
987                if filter.should_block(event.resource_type.as_ref()) {
988                    let params = FailRequestParams::new(request_id, ErrorReason::BlockedByClient);
989                    let _ = page.execute(params).await;
990                } else {
991                    let _ = page.execute(ContinueRequestParams::new(request_id)).await;
992                }
993            }
994        });
995
996        self.resource_filter_task = Some(task);
997        Ok(())
998    }
999
1000    /// Return the current page URL (post-navigation, post-redirect).
1001    ///
1002    /// Delegates to the CDP `Target.getTargetInfo` binding already used
1003    /// internally by [`save_cookies`](Self::save_cookies); no extra network
1004    /// request is made.  Returns an empty string if the URL is not yet set
1005    /// (e.g. on a blank tab before the first navigation).
1006    ///
1007    /// # Errors
1008    ///
1009    /// Returns [`BrowserError::CdpError`] if the underlying CDP call fails, or
1010    /// [`BrowserError::Timeout`] if it exceeds `cdp_timeout`.
1011    ///
1012    /// # Example
1013    ///
1014    /// ```no_run
1015    /// use stygian_browser::{BrowserPool, BrowserConfig};
1016    /// use stygian_browser::page::WaitUntil;
1017    /// use std::time::Duration;
1018    ///
1019    /// # async fn run() -> stygian_browser::error::Result<()> {
1020    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1021    /// let handle = pool.acquire().await?;
1022    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1023    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
1024    /// let url = page.url().await?;
1025    /// println!("Final URL after redirects: {url}");
1026    /// # Ok(())
1027    /// # }
1028    /// ```
1029    pub async fn url(&self) -> Result<String> {
1030        timeout(self.cdp_timeout, self.page.url())
1031            .await
1032            .map_err(|_| BrowserError::Timeout {
1033                operation: "page.url".to_string(),
1034                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1035            })?
1036            .map_err(|e| BrowserError::CdpError {
1037                operation: "page.url".to_string(),
1038                message: e.to_string(),
1039            })
1040            .map(Option::unwrap_or_default)
1041    }
1042
1043    /// Return the HTTP status code of the most recent main-frame navigation.
1044    ///
1045    /// The status is captured from the `Network.responseReceived` CDP event
1046    /// wired up inside [`navigate`](Self::navigate), so it reflects the
1047    /// *final* response after any server-side redirects.
1048    ///
1049    /// Returns `None` if the status was not captured — for example on `file://`
1050    /// navigations, when [`navigate`](Self::navigate) has not yet been called,
1051    /// or if the network event subscription failed.
1052    ///
1053    /// # Errors
1054    ///
1055    /// This method is infallible; the `Result` wrapper is kept for API
1056    /// consistency with other `PageHandle` methods.
1057    ///
1058    /// # Example
1059    ///
1060    /// ```no_run
1061    /// use stygian_browser::{BrowserPool, BrowserConfig};
1062    /// use stygian_browser::page::WaitUntil;
1063    /// use std::time::Duration;
1064    ///
1065    /// # async fn run() -> stygian_browser::error::Result<()> {
1066    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1067    /// let handle = pool.acquire().await?;
1068    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1069    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
1070    /// if let Some(code) = page.status_code()? {
1071    ///     println!("HTTP {code}");
1072    /// }
1073    /// # Ok(())
1074    /// # }
1075    /// ```
1076    pub fn status_code(&self) -> Result<Option<u16>> {
1077        let code = self.last_status_code.load(Ordering::Acquire);
1078        Ok(if code == 0 { None } else { Some(code) })
1079    }
1080
1081    /// Return the page's `<title>` text.
1082    ///
1083    /// # Errors
1084    ///
1085    /// Returns [`BrowserError::ScriptExecutionFailed`] if the evaluation fails.
1086    pub async fn title(&self) -> Result<String> {
1087        timeout(self.cdp_timeout, self.page.get_title())
1088            .await
1089            .map_err(|_| BrowserError::Timeout {
1090                operation: "get_title".to_string(),
1091                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1092            })?
1093            .map_err(|e| BrowserError::ScriptExecutionFailed {
1094                script: "document.title".to_string(),
1095                reason: e.to_string(),
1096            })
1097            .map(Option::unwrap_or_default)
1098    }
1099
1100    /// Return the page's full outer HTML.
1101    ///
1102    /// # Errors
1103    ///
1104    /// Returns [`BrowserError::ScriptExecutionFailed`] if the evaluation fails.
1105    pub async fn content(&self) -> Result<String> {
1106        timeout(self.cdp_timeout, self.page.content())
1107            .await
1108            .map_err(|_| BrowserError::Timeout {
1109                operation: "page.content".to_string(),
1110                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1111            })?
1112            .map_err(|e| BrowserError::ScriptExecutionFailed {
1113                script: "document.documentElement.outerHTML".to_string(),
1114                reason: e.to_string(),
1115            })
1116    }
1117
1118    /// Query the live DOM for all elements matching `selector` and return
1119    /// lightweight [`NodeHandle`]s backed by CDP `RemoteObjectId`s.
1120    ///
1121    /// No HTML serialisation occurs — the browser's in-memory DOM is queried
1122    /// directly over the CDP connection, eliminating the `page.content()` +
1123    /// `scraper::Html::parse_document` round-trip.
1124    ///
1125    /// Returns an empty `Vec` when no elements match (consistent with the JS
1126    /// `querySelectorAll` contract — not an error).
1127    ///
1128    /// # Errors
1129    ///
1130    /// Returns [`BrowserError::CdpError`] if the CDP find call fails, or
1131    /// [`BrowserError::Timeout`] if it exceeds `cdp_timeout`.
1132    ///
1133    /// # Example
1134    ///
1135    /// ```no_run
1136    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
1137    /// use std::time::Duration;
1138    ///
1139    /// # async fn run() -> stygian_browser::error::Result<()> {
1140    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1141    /// let handle = pool.acquire().await?;
1142    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1143    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
1144    ///
1145    /// let nodes = page.query_selector_all("[data-ux]").await?;
1146    /// for node in &nodes {
1147    ///     let ux_type = node.attr("data-ux").await?;
1148    ///     let text    = node.text_content().await?;
1149    ///     println!("{ux_type:?}: {text}");
1150    /// }
1151    /// # Ok(())
1152    /// # }
1153    /// ```
1154    pub async fn query_selector_all(&self, selector: &str) -> Result<Vec<NodeHandle>> {
1155        let elements = timeout(self.cdp_timeout, self.page.find_elements(selector))
1156            .await
1157            .map_err(|_| BrowserError::Timeout {
1158                operation: "PageHandle::query_selector_all".to_string(),
1159                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1160            })?
1161            .map_err(|e| BrowserError::CdpError {
1162                operation: "PageHandle::query_selector_all".to_string(),
1163                message: e.to_string(),
1164            })?;
1165
1166        let selector_arc: Arc<str> = Arc::from(selector);
1167        Ok(elements
1168            .into_iter()
1169            .map(|el| NodeHandle {
1170                element: el,
1171                selector: selector_arc.clone(),
1172                cdp_timeout: self.cdp_timeout,
1173                page: self.page.clone(),
1174            })
1175            .collect())
1176    }
1177
1178    /// Evaluate arbitrary JavaScript and return the result as `T`.
1179    ///
1180    /// # Errors
1181    ///
1182    /// Returns [`BrowserError::ScriptExecutionFailed`] on eval failure or
1183    /// deserialization error.
1184    pub async fn eval<T: serde::de::DeserializeOwned>(&self, script: &str) -> Result<T> {
1185        let script_owned = script.to_string();
1186        timeout(self.cdp_timeout, self.page.evaluate(script))
1187            .await
1188            .map_err(|_| BrowserError::Timeout {
1189                operation: "page.evaluate".to_string(),
1190                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1191            })?
1192            .map_err(|e| BrowserError::ScriptExecutionFailed {
1193                script: script_owned.clone(),
1194                reason: e.to_string(),
1195            })?
1196            .into_value::<T>()
1197            .map_err(|e| BrowserError::ScriptExecutionFailed {
1198                script: script_owned,
1199                reason: e.to_string(),
1200            })
1201    }
1202
1203    /// Save all cookies for the current page's origin.
1204    ///
1205    /// # Errors
1206    ///
1207    /// Returns [`BrowserError::CdpError`] if the CDP call fails.
1208    pub async fn save_cookies(
1209        &self,
1210    ) -> Result<Vec<chromiumoxide::cdp::browser_protocol::network::Cookie>> {
1211        use chromiumoxide::cdp::browser_protocol::network::GetCookiesParams;
1212
1213        let url = self
1214            .page
1215            .url()
1216            .await
1217            .map_err(|e| BrowserError::CdpError {
1218                operation: "page.url".to_string(),
1219                message: e.to_string(),
1220            })?
1221            .unwrap_or_default();
1222
1223        timeout(
1224            self.cdp_timeout,
1225            self.page
1226                .execute(GetCookiesParams::builder().urls(vec![url]).build()),
1227        )
1228        .await
1229        .map_err(|_| BrowserError::Timeout {
1230            operation: "Network.getCookies".to_string(),
1231            duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1232        })?
1233        .map_err(|e| BrowserError::CdpError {
1234            operation: "Network.getCookies".to_string(),
1235            message: e.to_string(),
1236        })
1237        .map(|r| r.cookies.clone())
1238    }
1239
1240    /// Inject cookies into the current page.
1241    ///
1242    /// Seeds session tokens or other state without needing a full
1243    /// [`SessionSnapshot`][crate::session::SessionSnapshot] and without
1244    /// requiring a direct `chromiumoxide` dependency in calling code.
1245    ///
1246    /// Individual cookie failures are logged as warnings and do not abort the
1247    /// remaining cookies.
1248    ///
1249    /// # Errors
1250    ///
1251    /// Returns [`BrowserError::Timeout`] if a single `Network.setCookie` CDP
1252    /// call exceeds `cdp_timeout`.
1253    ///
1254    /// # Example
1255    ///
1256    /// ```no_run
1257    /// use stygian_browser::{BrowserPool, BrowserConfig};
1258    /// use stygian_browser::session::SessionCookie;
1259    /// use std::time::Duration;
1260    ///
1261    /// # async fn run() -> stygian_browser::error::Result<()> {
1262    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1263    /// let handle = pool.acquire().await?;
1264    /// let page = handle.browser().expect("valid browser").new_page().await?;
1265    /// let cookies = vec![SessionCookie {
1266    ///     name: "session".to_string(),
1267    ///     value: "abc123".to_string(),
1268    ///     domain: ".example.com".to_string(),
1269    ///     path: "/".to_string(),
1270    ///     expires: -1.0,
1271    ///     http_only: true,
1272    ///     secure: true,
1273    ///     same_site: "Lax".to_string(),
1274    /// }];
1275    /// page.inject_cookies(&cookies).await?;
1276    /// # Ok(())
1277    /// # }
1278    /// ```
1279    pub async fn inject_cookies(&self, cookies: &[crate::session::SessionCookie]) -> Result<()> {
1280        use chromiumoxide::cdp::browser_protocol::network::SetCookieParams;
1281
1282        for cookie in cookies {
1283            let params = match SetCookieParams::builder()
1284                .name(cookie.name.clone())
1285                .value(cookie.value.clone())
1286                .domain(cookie.domain.clone())
1287                .path(cookie.path.clone())
1288                .http_only(cookie.http_only)
1289                .secure(cookie.secure)
1290                .build()
1291            {
1292                Ok(p) => p,
1293                Err(e) => {
1294                    warn!(cookie = %cookie.name, error = %e, "Failed to build cookie params");
1295                    continue;
1296                }
1297            };
1298
1299            match timeout(self.cdp_timeout, self.page.execute(params)).await {
1300                Err(_) => {
1301                    warn!(
1302                        cookie = %cookie.name,
1303                        timeout_ms = self.cdp_timeout.as_millis(),
1304                        "Timed out injecting cookie"
1305                    );
1306                }
1307                Ok(Err(e)) => {
1308                    warn!(cookie = %cookie.name, error = %e, "Failed to inject cookie");
1309                }
1310                Ok(Ok(_)) => {}
1311            }
1312        }
1313
1314        debug!(count = cookies.len(), "Cookies injected");
1315        Ok(())
1316    }
1317
1318    /// Capture a screenshot of the current page as PNG bytes.
1319    ///
1320    /// The screenshot is full-page by default (viewport clipped to the rendered
1321    /// layout area).  Save the returned bytes to a `.png` file or process
1322    /// them in-memory.
1323    ///
1324    /// # Errors
1325    ///
1326    /// Returns [`BrowserError::CdpError`] if the CDP `Page.captureScreenshot`
1327    /// command fails, or [`BrowserError::Timeout`] if it exceeds
1328    /// `cdp_timeout`.
1329    ///
1330    /// # Example
1331    ///
1332    /// ```no_run
1333    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
1334    /// use std::{time::Duration, fs};
1335    ///
1336    /// # async fn run() -> stygian_browser::error::Result<()> {
1337    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1338    /// let handle = pool.acquire().await?;
1339    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1340    /// page.navigate("https://example.com", WaitUntil::Selector("body".to_string()), Duration::from_secs(30)).await?;
1341    /// let png = page.screenshot().await?;
1342    /// fs::write("screenshot.png", &png).unwrap();
1343    /// # Ok(())
1344    /// # }
1345    /// ```
1346    pub async fn screenshot(&self) -> Result<Vec<u8>> {
1347        use chromiumoxide::page::ScreenshotParams;
1348
1349        let params = ScreenshotParams::builder().full_page(true).build();
1350
1351        timeout(self.cdp_timeout, self.page.screenshot(params))
1352            .await
1353            .map_err(|_| BrowserError::Timeout {
1354                operation: "Page.captureScreenshot".to_string(),
1355                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1356            })?
1357            .map_err(|e| BrowserError::CdpError {
1358                operation: "Page.captureScreenshot".to_string(),
1359                message: e.to_string(),
1360            })
1361    }
1362
1363    /// Borrow the underlying chromiumoxide [`Page`].
1364    pub const fn inner(&self) -> &Page {
1365        &self.page
1366    }
1367
1368    /// Close this page (tab).
1369    ///
1370    /// Called automatically on drop; explicit call avoids suppressing the error.
1371    pub async fn close(self) -> Result<()> {
1372        timeout(Duration::from_secs(5), self.page.clone().close())
1373            .await
1374            .map_err(|_| BrowserError::Timeout {
1375                operation: "page.close".to_string(),
1376                duration_ms: 5000,
1377            })?
1378            .map_err(|e| BrowserError::CdpError {
1379                operation: "page.close".to_string(),
1380                message: e.to_string(),
1381            })
1382    }
1383}
1384
1385// ─── Stealth diagnostics ──────────────────────────────────────────────────────
1386
#[cfg(feature = "stealth")]
impl PageHandle {
    /// Evaluate every built-in stealth detection check on the current page.
    ///
    /// Each entry of [`crate::diagnostic::all_checks`] has its script run
    /// through [`eval`](Self::eval); the string it yields is handed to the
    /// check's own parser.  All outcomes are collected into a single
    /// [`crate::diagnostic::DiagnosticReport`].
    ///
    /// A check whose script cannot be evaluated (JS exception, timeout,
    /// deserialization error) is logged and recorded as a failed check with a
    /// `script error: …` detail; it does **not** abort the remaining checks.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`: evaluation errors are
    /// captured per check inside the report rather than propagated.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # async fn run() -> stygian_browser::error::Result<()> {
    /// use stygian_browser::{BrowserPool, BrowserConfig};
    /// use stygian_browser::page::WaitUntil;
    /// use std::time::Duration;
    ///
    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
    /// let handle = pool.acquire().await?;
    /// let browser = handle.browser().expect("valid browser");
    /// let mut page = browser.new_page().await?;
    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(10)).await?;
    ///
    /// let report = page.verify_stealth().await?;
    /// println!("Stealth: {}/{} checks passed", report.passed_count, report.checks.len());
    /// for failure in report.failures() {
    ///     eprintln!("  FAIL  {}: {}", failure.description, failure.details);
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub async fn verify_stealth(&self) -> Result<crate::diagnostic::DiagnosticReport> {
        use crate::diagnostic::{CheckResult, DiagnosticReport, all_checks};

        let mut outcomes: Vec<CheckResult> = Vec::new();

        for check in all_checks() {
            let evaluated = self.eval::<String>(check.script).await;

            let outcome = match evaluated {
                Err(e) => {
                    warn!(
                        check = ?check.id,
                        error = %e,
                        "stealth check script failed during evaluation"
                    );
                    // A script that cannot run counts as a failed check.
                    CheckResult {
                        id: check.id,
                        description: check.description.to_string(),
                        passed: false,
                        details: format!("script error: {e}"),
                    }
                }
                Ok(json) => check.parse_output(&json),
            };

            debug!(
                check = ?outcome.id,
                passed = outcome.passed,
                details = %outcome.details,
                "stealth check result"
            );
            outcomes.push(outcome);
        }

        Ok(DiagnosticReport::new(outcomes))
    }
}
1459
1460// ─── extract feature ─────────────────────────────────────────────────────────
1461
#[cfg(feature = "extract")]
impl PageHandle {
    /// Build one `T` from every element that matches `selector`.
    ///
    /// The matched elements are obtained with
    /// [`query_selector_all`](Self::query_selector_all); each one then serves
    /// as the root node passed to `T::extract_from`.  When nothing matches the
    /// result is an empty `Vec`, not an error.
    ///
    /// The per-node extractions are awaited together through
    /// [`futures::future::try_join_all`], so the first failure ends the whole
    /// call.
    ///
    /// # Errors
    ///
    /// Propagates any error from the initial `query_selector_all`, and returns
    /// [`BrowserError::ExtractionFailed`] if extracting any node fails.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use stygian_browser::extract::Extract;
    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
    /// use std::time::Duration;
    ///
    /// #[derive(Extract)]
    /// struct Link {
    ///     #[selector("a", attr = "href")]
    ///     href: Option<String>,
    /// }
    ///
    /// # async fn run() -> stygian_browser::error::Result<()> {
    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
    /// let handle = pool.acquire().await?;
    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
    /// page.navigate(
    ///     "https://example.com",
    ///     WaitUntil::DomContentLoaded,
    ///     Duration::from_secs(30),
    /// ).await?;
    /// let links: Vec<Link> = page.extract_all::<Link>("nav li").await?;
    /// # Ok(())
    /// # }
    /// ```
    pub async fn extract_all<T>(&self, selector: &str) -> Result<Vec<T>>
    where
        T: crate::extract::Extractable,
    {
        use futures::future::try_join_all;

        let roots = self.query_selector_all(selector).await?;

        // One extraction future per matched node, awaited as a group.
        let pending = roots.iter().map(|root| T::extract_from(root));
        match try_join_all(pending).await {
            Ok(items) => Ok(items),
            Err(cause) => Err(BrowserError::ExtractionFailed(cause)),
        }
    }
}
1517
1518// ─── similarity feature ──────────────────────────────────────────────────────
1519
#[cfg(feature = "similarity")]
impl NodeHandle {
    /// Compute a structural [`crate::similarity::ElementFingerprint`] for this
    /// node.
    ///
    /// A single JavaScript function is invoked on the element.  It gathers the
    /// lower-cased tag name, the sorted class list, the sorted attribute names
    /// (excluding `class` and `id`) and the number of ancestors below
    /// `<body>`, and returns them as one JSON string that is then
    /// deserialised on the Rust side.
    ///
    /// # Errors
    ///
    /// Returns [`BrowserError::Timeout`] if the call exceeds `cdp_timeout`,
    /// [`BrowserError::ScriptExecutionFailed`] if the script yields no string
    /// or a string that does not deserialise into a fingerprint, and otherwise
    /// whatever `cdp_err_or_stale` maps a failed call to (per the original
    /// documentation, [`BrowserError::StaleNode`] for an invalidated node).
    pub async fn fingerprint(&self) -> Result<crate::similarity::ElementFingerprint> {
        const JS: &str = r"function() {
    var el = this;
    var tag = el.tagName.toLowerCase();
    var classes = Array.prototype.slice.call(el.classList).sort();
    var attrNames = Array.prototype.slice.call(el.attributes)
        .map(function(a) { return a.name; })
        .filter(function(n) { return n !== 'class' && n !== 'id'; })
        .sort();
    var depth = 0;
    var n = el.parentElement;
    while (n && n.tagName.toLowerCase() !== 'body') { depth++; n = n.parentElement; }
    return JSON.stringify({ tag: tag, classes: classes, attrNames: attrNames, depth: depth });
}";
        const OPERATION: &str = "NodeHandle::fingerprint";

        let call = self.element.call_js_fn(JS, true);
        let returns = match timeout(self.cdp_timeout, call).await {
            Err(_elapsed) => {
                return Err(BrowserError::Timeout {
                    operation: OPERATION.to_string(),
                    duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
                });
            }
            Ok(Err(e)) => return Err(self.cdp_err_or_stale(&e, "fingerprint")),
            Ok(Ok(returns)) => returns,
        };

        // The script hands back its result as a JSON-encoded string.
        let payload = match returns.result.value.as_ref().and_then(|v| v.as_str()) {
            Some(text) => text,
            None => {
                return Err(BrowserError::ScriptExecutionFailed {
                    script: OPERATION.to_string(),
                    reason: "CDP returned no string value from fingerprint script".to_string(),
                });
            }
        };

        match serde_json::from_str::<crate::similarity::ElementFingerprint>(payload) {
            Ok(fingerprint) => Ok(fingerprint),
            Err(e) => Err(BrowserError::ScriptExecutionFailed {
                script: OPERATION.to_string(),
                reason: format!("failed to deserialise fingerprint JSON: {e}"),
            }),
        }
    }
}
1574
#[cfg(feature = "similarity")]
impl PageHandle {
    /// Find all elements in the current page that are structurally similar to
    /// `reference`, scored by [`crate::similarity::jaccard_weighted`].
    ///
    /// Computes a structural fingerprint for `reference` (via
    /// [`NodeHandle::fingerprint`]), then fingerprints every candidate returned
    /// by `query_selector_all("*")` and keeps those whose score is greater than
    /// or equal to `config.threshold`.  Results are ordered by score
    /// descending; candidates with equal scores keep the order in which the
    /// query returned them.
    ///
    /// When `config.max_results` is non-zero the result list is truncated to
    /// that many entries; a value of `0` leaves the list uncapped.
    ///
    /// Candidates are fingerprinted one after another, each with its own
    /// awaited [`NodeHandle::fingerprint`] call, so the cost grows with the
    /// number of elements in the page.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
    /// use stygian_browser::similarity::SimilarityConfig;
    /// use std::time::Duration;
    ///
    /// # async fn run() -> stygian_browser::error::Result<()> {
    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
    /// let handle = pool.acquire().await?;
    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
    ///
    /// let nodes = page.query_selector_all(".price").await?;
    /// if let Some(reference) = nodes.into_iter().next() {
    ///     let similar = page.find_similar(&reference, SimilarityConfig::default()).await?;
    ///     for m in &similar {
    ///         println!("score={:.2}", m.score);
    ///     }
    /// }
    /// # Ok(())
    /// # }
    /// ```
    ///
    /// # Errors
    ///
    /// Propagates any error from fingerprinting `reference` (for example
    /// [`BrowserError::StaleNode`] when it has been invalidated) and any error
    /// from the `*` element query.  A candidate whose fingerprint call fails
    /// for any reason is skipped and logged at `debug` level; it does not fail
    /// the search.
    pub async fn find_similar(
        &self,
        reference: &NodeHandle,
        config: crate::similarity::SimilarityConfig,
    ) -> Result<Vec<crate::similarity::SimilarMatch>> {
        use crate::similarity::{SimilarMatch, jaccard_weighted};

        let ref_fp = reference.fingerprint().await?;
        let candidates = self.query_selector_all("*").await?;

        let mut matches: Vec<SimilarMatch> = Vec::new();
        for node in candidates {
            match node.fingerprint().await {
                Ok(cand_fp) => {
                    let score = jaccard_weighted(&ref_fp, &cand_fp);
                    if score >= config.threshold {
                        matches.push(SimilarMatch { node, score });
                    }
                }
                // Best effort: one bad candidate (stale, detached, timed out,
                // malformed script output) must not abort the whole search,
                // but leave a trace so unexpectedly empty results can be
                // diagnosed.
                Err(err) => {
                    debug!(
                        error = %err,
                        "find_similar: skipping candidate that could not be fingerprinted"
                    );
                }
            }
        }

        // Highest score first. Incomparable scores are treated as equal so the
        // comparator stays total; the sort is stable, so ties keep query order.
        matches.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        // `max_results == 0` means "no cap".
        if config.max_results > 0 {
            matches.truncate(config.max_results);
        }

        Ok(matches)
    }
}
1648
1649impl Drop for PageHandle {
1650    fn drop(&mut self) {
1651        warn!("PageHandle dropped without explicit close(); spawning cleanup task");
1652        // chromiumoxide Page does not implement close on Drop, so we spawn
1653        // a fire-and-forget task. The page ref is already owned; we need to
1654        // swap it out. We clone the Page handle (it's Arc-backed internally).
1655        let page = self.page.clone();
1656        tokio::spawn(async move {
1657            let _ = page.close().await;
1658        });
1659    }
1660}
1661
1662// ─── Tests ────────────────────────────────────────────────────────────────────
1663
#[cfg(test)]
mod tests {
    use super::*;

    /// `block_media` rejects images, fonts, stylesheets and media while
    /// letting scripts and XHR through.
    #[test]
    fn resource_filter_block_media_blocks_image() {
        let media_filter = ResourceFilter::block_media();
        for blocked in ["Image", "Font", "Stylesheet", "Media"] {
            assert!(
                media_filter.should_block(blocked),
                "{blocked} should be blocked"
            );
        }
        for allowed in ["Script", "XHR"] {
            assert!(
                !media_filter.should_block(allowed),
                "{allowed} should not be blocked"
            );
        }
    }

    /// Resource-type matching ignores the case of the supplied string.
    #[test]
    fn resource_filter_case_insensitive() {
        let images_and_fonts = ResourceFilter::block_images_and_fonts();
        for spelling in ["image", "IMAGE"] {
            assert!(
                images_and_fonts.should_block(spelling),
                "{spelling} should be blocked"
            );
        }
        assert!(!images_and_fonts.should_block("Stylesheet"));
    }

    /// Chained `block` calls accumulate; unlisted types stay allowed.
    #[test]
    fn resource_filter_builder_chain() {
        let chained = ResourceFilter::default()
            .block(ResourceType::Image)
            .block(ResourceType::Font);
        for blocked in ["Image", "Font"] {
            assert!(chained.should_block(blocked), "{blocked} should be blocked");
        }
        assert!(!chained.should_block("Stylesheet"));
    }

    /// Blocking the same type twice stores it only once.
    #[test]
    fn resource_filter_dedup_block() {
        let blocked_twice = ResourceFilter::default()
            .block(ResourceType::Image)
            .block(ResourceType::Image);
        assert_eq!(blocked_twice.blocked.len(), 1);
    }

    /// The default filter is empty; the media preset is not.
    #[test]
    fn resource_filter_is_empty_when_default() {
        let untouched = ResourceFilter::default();
        let media = ResourceFilter::block_media();
        assert!(untouched.is_empty());
        assert!(!media.is_empty());
    }

    /// `WaitUntil::Selector` keeps the selector string it was built with.
    #[test]
    fn wait_until_selector_stores_string() {
        let wait = WaitUntil::Selector(String::from("#foo"));
        let stored = if let WaitUntil::Selector(css) = &wait {
            Some(css.as_str())
        } else {
            None
        };
        assert_eq!(stored, Some("#foo"));
    }

    /// Each resource type maps to its CDP name.
    #[test]
    fn resource_type_cdp_str() {
        let expectations = [
            (ResourceType::Image, "Image"),
            (ResourceType::Font, "Font"),
            (ResourceType::Stylesheet, "Stylesheet"),
            (ResourceType::Media, "Media"),
        ];
        for (kind, cdp_name) in expectations {
            assert_eq!(kind.as_cdp_str(), cdp_name);
        }
    }

    /// `PageHandle` must be `Send + Sync` for use across thread boundaries.
    #[test]
    fn page_handle_is_send_sync() {
        // Compile-time check only: the call does nothing at runtime.
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<PageHandle>();
    }

    /// The status-code sentinel (0 = "not yet captured") and the conversion to
    /// `Option<u16>` are pure-logic invariants testable without a live browser.
    #[test]
    fn status_code_sentinel_zero_maps_to_none() {
        use std::sync::atomic::{AtomicU16, Ordering};
        let raw = AtomicU16::new(0).load(Ordering::Acquire);
        let mapped = match raw {
            0 => None,
            code => Some(code),
        };
        assert_eq!(mapped, None::<u16>);
    }

    /// Any non-zero stored value comes back as `Some(value)`.
    #[test]
    fn status_code_non_zero_maps_to_some() {
        use std::sync::atomic::{AtomicU16, Ordering};
        for expected in [200u16, 301, 404, 503] {
            let raw = AtomicU16::new(expected).load(Ordering::Acquire);
            let mapped = match raw {
                0 => None,
                code => Some(code),
            };
            assert_eq!(mapped, Some(expected));
        }
    }

    // ── NodeHandle pure-logic tests ───────────────────────────────────────────

    /// `attr_map` relies on `chunks_exact(2)` — verify the pairing logic is
    /// correct without a live browser by exercising it directly.
    #[test]
    fn attr_map_chunking_pairs_correctly() {
        let flat = ["id", "main", "data-ux", "Section", "class", "container"].map(String::from);
        let paired: std::collections::HashMap<String, String> = flat
            .chunks_exact(2)
            .filter_map(|pair| match pair {
                [name, value] => Some((name.clone(), value.clone())),
                _ => None,
            })
            .collect();

        let lookup = |key: &str| paired.get(key).map(String::as_str);
        assert_eq!(lookup("id"), Some("main"));
        assert_eq!(lookup("data-ux"), Some("Section"));
        assert_eq!(lookup("class"), Some("container"));
        assert_eq!(paired.len(), 3);
    }

    /// Odd-length flat attribute lists (malformed CDP response) are handled
    /// gracefully — the trailing element is silently ignored.
    #[test]
    fn attr_map_chunking_ignores_odd_trailing() {
        // A lone name with no value never forms a complete pair.
        let flat = [String::from("orphan")];
        let paired: std::collections::HashMap<String, String> = flat
            .chunks_exact(2)
            .filter_map(|pair| match pair {
                [name, value] => Some((name.clone(), value.clone())),
                _ => None,
            })
            .collect();
        assert!(paired.is_empty());
    }

    /// Empty flat list → empty map.
    #[test]
    fn attr_map_chunking_empty_input() {
        let flat: Vec<String> = Vec::new();
        let mut paired: std::collections::HashMap<String, String> =
            std::collections::HashMap::new();
        for pair in flat.chunks_exact(2) {
            if let [name, value] = pair {
                paired.insert(name.clone(), value.clone());
            }
        }
        assert!(paired.is_empty());
    }

    /// `ancestors` JSON parsing: valid input round-trips correctly.
    #[test]
    fn ancestors_json_parse_round_trip() -> std::result::Result<(), serde_json::Error> {
        let parsed = serde_json::from_str::<Vec<String>>(r#"["p","article","body","html"]"#)?;
        assert_eq!(parsed, ["p", "article", "body", "html"]);
        Ok(())
    }

    /// `ancestors` JSON parsing: empty array (no parent) is fine.
    #[test]
    fn ancestors_json_parse_empty() -> std::result::Result<(), serde_json::Error> {
        let parsed = serde_json::from_str::<Vec<String>>("[]")?;
        assert!(parsed.is_empty());
        Ok(())
    }

    // ── Traversal selector suffix tests ──────────────────────────────────────

    /// A `StaleNode` error whose selector includes a traversal suffix (e.g.
    /// `"div::parent"`) must surface that suffix in its `Display` output so
    /// callers can locate the failed traversal in logs.
    #[test]
    fn traversal_selector_suffix_in_stale_error() {
        let stale = crate::error::BrowserError::StaleNode {
            selector: String::from("div::parent"),
        };
        let msg = stale.to_string();
        assert!(
            msg.contains("div::parent"),
            "StaleNode display must include the full selector; got: {msg}"
        );
    }

    /// Same check for the `::next` suffix produced by `next_sibling()`.
    #[test]
    fn traversal_next_suffix_in_stale_error() {
        let selector = "li.price::next";
        let stale = crate::error::BrowserError::StaleNode {
            selector: selector.to_string(),
        };
        assert!(stale.to_string().contains(selector));
    }

    /// Same check for the `::prev` suffix produced by `previous_sibling()`.
    #[test]
    fn traversal_prev_suffix_in_stale_error() {
        let selector = "td.label::prev";
        let stale = crate::error::BrowserError::StaleNode {
            selector: selector.to_string(),
        };
        assert!(stale.to_string().contains(selector));
    }
}