stygian_browser/page.rs
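//! Page-level browser handle: resource blocking, navigation and DOM queries.
//!
//! ## Resource blocking
//!
//! A [`ResourceFilter`] lists the resource types (images, fonts, stylesheets,
//! media) that should not be fetched. Install it on a page with
//! [`PageHandle::set_resource_filter`] before navigating.
//!
//! ## Wait strategies
//!
//! [`PageHandle`] exposes three wait strategies via [`WaitUntil`]:
//! - `DomContentLoaded` — fires when the HTML is parsed
//! - `NetworkIdle` — waits for the load event, then for in-flight requests to settle
//! - `Selector(css)` — waits until an element matching the CSS selector exists
//!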
9//! # Example
10//!
11//! ```no_run
12//! use stygian_browser::{BrowserPool, BrowserConfig};
13//! use stygian_browser::page::{ResourceFilter, WaitUntil};
14//! use std::time::Duration;
15//!
16//! # async fn run() -> stygian_browser::error::Result<()> {
17//! let pool = BrowserPool::new(BrowserConfig::default()).await?;
18//! let handle = pool.acquire().await?;
19//!
20//! let mut page = handle.browser().expect("valid browser").new_page().await?;
21//! page.set_resource_filter(ResourceFilter::block_media()).await?;
22//! page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
23//! let title = page.title().await?;
24//! println!("title: {title}");
25//! handle.release().await;
26//! # Ok(())
27//! # }
28//! ```
29
30use std::collections::HashMap;
31use std::sync::{
32 Arc,
33 atomic::{AtomicU16, Ordering},
34};
35use std::time::Duration;
36
37use chromiumoxide::Page;
38use tokio::time::timeout;
39use tracing::{debug, warn};
40
41use crate::error::{BrowserError, Result};
42
43// ─── ResourceType ─────────────────────────────────────────────────────────────
44
/// CDP resource types that can be intercepted.
///
/// Each variant corresponds to one resource-type name, available through
/// [`ResourceType::as_cdp_str`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ResourceType {
    /// `<img>`, `<picture>`, background images
    Image,
    /// Web fonts loaded via CSS `@font-face`
    Font,
    /// External CSS stylesheets
    Stylesheet,
    /// Media files (audio/video)
    Media,
}

impl ResourceType {
    /// Return the resource-type name used when comparing against the type
    /// string reported for an intercepted request.
    ///
    /// The returned names are `"Image"`, `"Font"`, `"Stylesheet"` and
    /// `"Media"`.
    pub const fn as_cdp_str(&self) -> &'static str {
        match *self {
            ResourceType::Image => "Image",
            ResourceType::Font => "Font",
            ResourceType::Stylesheet => "Stylesheet",
            ResourceType::Media => "Media",
        }
    }
}
68
69// ─── ResourceFilter ───────────────────────────────────────────────────────────
70
71///
72/// # Example
73///
74/// ```
75/// use stygian_browser::page::ResourceFilter;
76/// let filter = ResourceFilter::block_media();
77/// assert!(filter.should_block("Image"));
78/// ```
79#[derive(Debug, Clone, Default)]
80pub struct ResourceFilter {
81 blocked: Vec<ResourceType>,
82}
83
84impl ResourceFilter {
85 /// Block all media resources (images, fonts, CSS, audio/video).
86 pub fn block_media() -> Self {
87 Self {
88 blocked: vec![
89 ResourceType::Image,
90 ResourceType::Font,
91 ResourceType::Stylesheet,
92 ResourceType::Media,
93 ],
94 }
95 }
96
97 pub fn block_images_and_fonts() -> Self {
98 Self {
99 blocked: vec![ResourceType::Image, ResourceType::Font],
100 }
101 }
102
103 #[must_use]
104 pub fn block(mut self, resource: ResourceType) -> Self {
105 if !self.blocked.contains(&resource) {
106 self.blocked.push(resource);
107 }
108 self
109 }
110
111 pub fn should_block(&self, cdp_type: &str) -> bool {
112 self.blocked
113 .iter()
114 .any(|r| r.as_cdp_str().eq_ignore_ascii_case(cdp_type))
115 }
116
117 pub const fn is_empty(&self) -> bool {
118 self.blocked.is_empty()
119 }
120}
121
122// ─── WaitUntil ────────────────────────────────────────────────────────────────
123
/// Specifies what condition to wait for after a page navigation.
///
/// # Example
///
/// ```
/// use stygian_browser::page::WaitUntil;
/// ```
#[derive(Debug, Clone)]
pub enum WaitUntil {
    /// Fires when the initial HTML is fully parsed, without waiting for
    /// subresources such as images and stylesheets to finish loading.
    DomContentLoaded,
    /// Waits for the page load event and then for the number of in-flight
    /// network requests to stay low for a short settle period.
    NetworkIdle,
    /// Waits until an element matching the contained CSS selector can be
    /// found on the page.
    Selector(String),
}
139
140// ─── NodeHandle ───────────────────────────────────────────────────────────────
141
/// Handle to a single DOM element living in the browser.
///
/// Accessors operate on the live element by issuing one or
/// more CDP `Runtime.callFunctionOn` calls against the held V8 remote object
/// reference — no HTML serialisation occurs.
///
/// A handle becomes **stale** after page navigation or if the underlying DOM
/// node is removed. Stale calls return [`BrowserError::StaleNode`] so callers
/// can distinguish them from other CDP failures.
///
/// # Example
///
/// ```no_run
/// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
/// use std::time::Duration;
///
/// # async fn run() -> stygian_browser::error::Result<()> {
/// let pool = BrowserPool::new(BrowserConfig::default()).await?;
/// let handle = pool.acquire().await?;
/// let mut page = handle.browser().expect("valid browser").new_page().await?;
/// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
/// let nodes = page.query_selector_all("a").await?;
/// for node in &nodes {
///     let href = node.attr("href").await?;
///     let text = node.text_content().await?;
///     println!("{text}: {href:?}");
/// }
/// # Ok(())
/// # }
/// ```
pub struct NodeHandle {
    /// Underlying chromiumoxide element that every CDP call is issued against.
    element: chromiumoxide::element::Element,
    /// Selector that produced this handle; reported in
    /// [`BrowserError::StaleNode`] errors.
    ///
    /// Shared via `Arc<str>` so all handles from a single query reuse the
    /// same allocation rather than cloning a `String` per node.
    selector: Arc<str>,
    /// Upper bound applied to the CDP calls made through this handle.
    cdp_timeout: Duration,
    /// Page the element belongs to; used to resolve the tagged target element
    /// during DOM traversal (parent / sibling navigation).
    page: chromiumoxide::Page,
}
179
180impl NodeHandle {
181 /// Return a single attribute value, or `None` if the attribute is absent.
182 ///
183 /// Issues one `Runtime.callFunctionOn` CDP call (`el.getAttribute(name)`).
184 ///
185 /// # Errors
186 ///
187 /// invalidated, or [`BrowserError::Timeout`] / [`BrowserError::CdpError`]
188 /// on transport-level failures.
189 pub async fn attr(&self, name: &str) -> Result<Option<String>> {
190 timeout(self.cdp_timeout, self.element.attribute(name))
191 .await
192 .map_err(|_| BrowserError::Timeout {
193 operation: "NodeHandle::attr".to_string(),
194 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
195 })?
196 .map_err(|e| self.cdp_err_or_stale(&e, "attr"))
197 }
198
199 /// Return all attributes as a `HashMap<name, value>` in a **single**
200 /// CDP round-trip.
201 ///
202 /// Uses `DOM.getAttributes` (via the chromiumoxide `attributes()` API)
203 /// which returns a flat `[name, value, name, value, …]` list from the node
204 /// description — no per-attribute calls are needed.
205 ///
206 /// # Errors
207 ///
208 /// invalidated.
209 pub async fn attr_map(&self) -> Result<HashMap<String, String>> {
210 let flat = timeout(self.cdp_timeout, self.element.attributes())
211 .await
212 .map_err(|_| BrowserError::Timeout {
213 operation: "NodeHandle::attr_map".to_string(),
214 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
215 })?
216 .map_err(|e| self.cdp_err_or_stale(&e, "attr_map"))?;
217
218 let mut map = HashMap::with_capacity(flat.len() / 2);
219 for pair in flat.chunks_exact(2) {
220 if let [name, value] = pair {
221 map.insert(name.clone(), value.clone());
222 }
223 }
224 Ok(map)
225 }
226
227 /// Return the element's `textContent` (all text inside, no markup).
228 ///
229 /// Reads the DOM `textContent` property via a single JS eval — this is the
230 /// raw text concatenation of all descendant text nodes, independent of
231 /// layout or visibility (unlike `innerText`).
232 ///
233 ///
234 /// # Errors
235 ///
236 /// invalidated.
237 pub async fn text_content(&self) -> Result<String> {
238 let returns = timeout(
239 self.cdp_timeout,
240 self.element
241 .call_js_fn(r"function() { return this.textContent ?? ''; }", true),
242 )
243 .await
244 .map_err(|_| BrowserError::Timeout {
245 operation: "NodeHandle::text_content".to_string(),
246 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
247 })?
248 .map_err(|e| self.cdp_err_or_stale(&e, "text_content"))?;
249
250 Ok(returns
251 .result
252 .value
253 .as_ref()
254 .and_then(|v| v.as_str())
255 .unwrap_or("")
256 .to_string())
257 }
258
259 /// Return the element's `innerHTML`.
260 ///
261 ///
262 /// # Errors
263 ///
264 /// invalidated.
265 pub async fn inner_html(&self) -> Result<String> {
266 timeout(self.cdp_timeout, self.element.inner_html())
267 .await
268 .map_err(|_| BrowserError::Timeout {
269 operation: "NodeHandle::inner_html".to_string(),
270 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
271 })?
272 .map_err(|e| self.cdp_err_or_stale(&e, "inner_html"))
273 .map(Option::unwrap_or_default)
274 }
275
276 /// Return the element's `outerHTML`.
277 ///
278 ///
279 /// # Errors
280 ///
281 /// invalidated.
282 pub async fn outer_html(&self) -> Result<String> {
283 timeout(self.cdp_timeout, self.element.outer_html())
284 .await
285 .map_err(|_| BrowserError::Timeout {
286 operation: "NodeHandle::outer_html".to_string(),
287 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
288 })?
289 .map_err(|e| self.cdp_err_or_stale(&e, "outer_html"))
290 .map(Option::unwrap_or_default)
291 }
292
293 ///
294 /// Executes a single `Runtime.callFunctionOn` JavaScript function that
295 /// walks `parentElement` and collects tag names — no repeated CDP calls.
296 ///
297 /// ```text
298 /// ["p", "article", "body", "html"]
299 /// ```
300 ///
301 /// # Errors
302 ///
303 /// invalidated, or [`BrowserError::ScriptExecutionFailed`] when CDP
304 pub async fn ancestors(&self) -> Result<Vec<String>> {
305 let returns = timeout(
306 self.cdp_timeout,
307 self.element.call_js_fn(
308 r"function() {
309 const a = [];
310 let n = this.parentElement;
311 while (n) { a.push(n.tagName.toLowerCase()); n = n.parentElement; }
312 return a;
313 }",
314 true,
315 ),
316 )
317 .await
318 .map_err(|_| BrowserError::Timeout {
319 operation: "NodeHandle::ancestors".to_string(),
320 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
321 })?
322 .map_err(|e| self.cdp_err_or_stale(&e, "ancestors"))?;
323
324 // With returnByValue=true and an array return, CDP delivers the value
325 // as a JSON array directly — no JSON.stringify/re-parse needed.
326 // A missing or wrong-type value indicates an unexpected CDP failure.
327 let arr = returns
328 .result
329 .value
330 .as_ref()
331 .and_then(|v| v.as_array())
332 .ok_or_else(|| BrowserError::ScriptExecutionFailed {
333 script: "NodeHandle::ancestors".to_string(),
334 reason: "CDP returned no value or a non-array value for ancestors()".to_string(),
335 })?;
336
337 arr.iter()
338 .map(|v| {
339 v.as_str().map(ToString::to_string).ok_or_else(|| {
340 BrowserError::ScriptExecutionFailed {
341 script: "NodeHandle::ancestors".to_string(),
342 reason: format!("ancestor entry is not a string: {v}"),
343 }
344 })
345 })
346 .collect()
347 }
348
349 ///
350 ///
351 ///
352 /// # Errors
353 ///
354 /// invalidated, or [`BrowserError::CdpError`] on transport failure.
355 pub async fn children_matching(&self, selector: &str) -> Result<Vec<Self>> {
356 let elements = timeout(self.cdp_timeout, self.element.find_elements(selector))
357 .await
358 .map_err(|_| BrowserError::Timeout {
359 operation: "NodeHandle::children_matching".to_string(),
360 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
361 })?
362 .map_err(|e| self.cdp_err_or_stale(&e, "children_matching"))?;
363
364 let selector_arc: Arc<str> = Arc::from(selector);
365 Ok(elements
366 .into_iter()
367 .map(|el| Self {
368 element: el,
369 selector: selector_arc.clone(),
370 cdp_timeout: self.cdp_timeout,
371 page: self.page.clone(),
372 })
373 .collect())
374 }
375
376 /// Return the immediate parent element, or `None` if this element has no
377 /// parent (i.e. it is the document root).
378 ///
379 /// Issues a single `Runtime.callFunctionOn` CDP call that temporarily tags
380 /// the parent element with a unique attribute, then resolves it via a
381 /// CSS attribute selector.
382 ///
383 /// # Errors
384 ///
385 /// Returns an error if the CDP call fails or the page handle is invalidated.
386 ///
387 /// # Example
388 ///
389 /// ```no_run
390 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
391 /// use std::time::Duration;
392 ///
393 /// # async fn run() -> stygian_browser::error::Result<()> {
394 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
395 /// let handle = pool.acquire().await?;
396 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
397 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
398 /// # let nodes = page.query_selector_all("a").await?;
399 /// if let Some(parent) = nodes[0].parent().await? {
400 /// let html = parent.outer_html().await?;
401 /// println!("parent: {}", &html[..html.len().min(80)]);
402 /// }
403 /// # Ok(())
404 /// # }
405 /// ```
406 pub async fn parent(&self) -> Result<Option<Self>> {
407 let attr = format!(
408 "data-stygian-t-{}",
409 ulid::Ulid::new().to_string().to_lowercase()
410 );
411 let js = format!(
412 "function() {{ \
413 var t = this.parentElement; \
414 if (!t) {{ return false; }} \
415 t.setAttribute('{attr}', '1'); \
416 return true; \
417 }}"
418 );
419 self.call_traversal(&js, &attr, "parent").await
420 }
421
422 /// Return the next element sibling, or `None` if this element is the last
423 /// child of its parent.
424 ///
425 /// Uses `nextElementSibling` (skips text/comment nodes).
426 ///
427 /// # Errors
428 ///
429 /// invalidated.
430 ///
431 /// # Example
432 ///
433 /// ```no_run
434 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
435 /// use std::time::Duration;
436 ///
437 /// # async fn run() -> stygian_browser::error::Result<()> {
438 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
439 /// let handle = pool.acquire().await?;
440 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
441 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
442 /// # let nodes = page.query_selector_all("a").await?;
443 /// if let Some(next) = nodes[0].next_sibling().await? {
444 /// println!("next sibling: {}", next.text_content().await?);
445 /// }
446 /// # Ok(())
447 /// # }
448 /// ```
449 pub async fn next_sibling(&self) -> Result<Option<Self>> {
450 let attr = format!(
451 "data-stygian-t-{}",
452 ulid::Ulid::new().to_string().to_lowercase()
453 );
454 let js = format!(
455 "function() {{ \
456 var t = this.nextElementSibling; \
457 if (!t) {{ return false; }} \
458 t.setAttribute('{attr}', '1'); \
459 return true; \
460 }}"
461 );
462 self.call_traversal(&js, &attr, "next").await
463 }
464
465 /// Return the previous element sibling, or `None` if this element is the
466 /// first child of its parent.
467 ///
468 /// Uses `previousElementSibling` (skips text/comment nodes).
469 ///
470 /// # Errors
471 ///
472 /// invalidated.
473 ///
474 /// # Example
475 ///
476 /// ```no_run
477 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
478 /// use std::time::Duration;
479 ///
480 /// # async fn run() -> stygian_browser::error::Result<()> {
481 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
482 /// let handle = pool.acquire().await?;
483 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
484 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
485 /// # let nodes = page.query_selector_all("a").await?;
486 /// if let Some(prev) = nodes[1].previous_sibling().await? {
487 /// println!("prev sibling: {}", prev.text_content().await?);
488 /// }
489 /// # Ok(())
490 /// # }
491 /// ```
492 pub async fn previous_sibling(&self) -> Result<Option<Self>> {
493 let attr = format!(
494 "data-stygian-t-{}",
495 ulid::Ulid::new().to_string().to_lowercase()
496 );
497 let js = format!(
498 "function() {{ \
499 var t = this.previousElementSibling; \
500 if (!t) {{ return false; }} \
501 t.setAttribute('{attr}', '1'); \
502 return true; \
503 }}"
504 );
505 self.call_traversal(&js, &attr, "prev").await
506 }
507
508 /// Shared traversal implementation used by [`parent`], [`next_sibling`],
509 /// and [`previous_sibling`].
510 ///
511 /// The caller provides a JS function that:
512 /// 1. Computes the traversal target (for example, the parent, next
513 /// sibling, or previous sibling) and stores it in a local variable.
514 /// 2. If the target is non-null, sets a unique attribute (`attr_name`)
515 /// on it and returns `true`.
516 /// 3. Returns `false` when the target is null (no such neighbour).
517 ///
518 /// This helper then resolves the tagged element from the document root,
519 /// removes the temporary attribute, and wraps the result in a
520 /// `NodeHandle`.
521 ///
522 /// [`parent`]: Self::parent
523 /// [`next_sibling`]: Self::next_sibling
524 /// [`previous_sibling`]: Self::previous_sibling
525 async fn call_traversal(
526 &self,
527 js_fn: &str,
528 attr_name: &str,
529 selector_suffix: &str,
530 ) -> Result<Option<Self>> {
531 // Step 1: Run the JS that tags the target element and reports null/non-null.
532 let op_tag = format!("NodeHandle::{selector_suffix}::tag");
533 let returns = timeout(self.cdp_timeout, self.element.call_js_fn(js_fn, false))
534 .await
535 .map_err(|_| BrowserError::Timeout {
536 operation: op_tag.clone(),
537 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
538 })?
539 .map_err(|e| self.cdp_err_or_stale(&e, selector_suffix))?;
540
541 // JS returns false → no such neighbour.
542 let has_target = returns
543 .result
544 .value
545 .as_ref()
546 .and_then(serde_json::Value::as_bool)
547 .unwrap_or(false);
548 if !has_target {
549 return Ok(None);
550 }
551
552 let css = format!("[{attr_name}]");
553 let op_resolve = format!("NodeHandle::{selector_suffix}::resolve");
554 let element = timeout(self.cdp_timeout, self.page.find_element(css))
555 .await
556 .map_err(|_| BrowserError::Timeout {
557 operation: op_resolve.clone(),
558 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
559 })?
560 .map_err(|e| BrowserError::CdpError {
561 operation: op_resolve,
562 message: e.to_string(),
563 })?;
564
565 // is non-fatal — it leaves a harmless stale attribute in the DOM).
566 let cleanup = format!("function() {{ this.removeAttribute('{attr_name}'); }}");
567 let _ = element.call_js_fn(cleanup, false).await;
568
569 let new_selector: Arc<str> =
570 Arc::from(format!("{}::{selector_suffix}", self.selector).as_str());
571 Ok(Some(Self {
572 element,
573 selector: new_selector,
574 cdp_timeout: self.cdp_timeout,
575 page: self.page.clone(),
576 }))
577 }
578
579 /// (when the remote object reference has been invalidated) or
580 fn cdp_err_or_stale(
581 &self,
582 err: &chromiumoxide::error::CdpError,
583 operation: &str,
584 ) -> BrowserError {
585 let msg = err.to_string();
586 if msg.contains("Cannot find object with id")
587 || msg.contains("context with specified id")
588 || msg.contains("Cannot find context")
589 {
590 BrowserError::StaleNode {
591 selector: self.selector.to_string(),
592 }
593 } else {
594 BrowserError::CdpError {
595 operation: operation.to_string(),
596 message: msg,
597 }
598 }
599 }
600}
601
602// ─── PageHandle ───────────────────────────────────────────────────────────────
603
/// Handle to a single browser page (tab).
///
/// Wraps a chromiumoxide [`Page`] together with the timeout applied to CDP
/// calls, the status code captured during the last navigation, and the
/// optional background task that enforces a resource filter.
///
/// # Example
///
/// ```no_run
/// use stygian_browser::{BrowserPool, BrowserConfig};
/// use stygian_browser::page::WaitUntil;
/// use std::time::Duration;
///
/// # async fn run() -> stygian_browser::error::Result<()> {
/// let pool = BrowserPool::new(BrowserConfig::default()).await?;
/// let handle = pool.acquire().await?;
/// let mut page = handle.browser().expect("valid browser").new_page().await?;
/// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
/// let html = page.content().await?;
/// drop(page); // closes the tab
/// handle.release().await;
/// # Ok(())
/// # }
/// ```
pub struct PageHandle {
    /// Underlying chromiumoxide page all CDP calls are issued against.
    page: Page,
    /// Upper bound applied to individual CDP calls made through this handle.
    cdp_timeout: Duration,
    /// HTTP status code of the most recent main-frame navigation, or `0` if not
    /// yet captured. Shared with the listener task spawned on each navigation.
    last_status_code: Arc<AtomicU16>,
    /// Background task processing `Fetch.requestPaused` events. Aborted and
    /// replaced each time `set_resource_filter` is called.
    resource_filter_task: Option<tokio::task::JoinHandle<()>>,
}
633
634impl PageHandle {
635 /// Wrap a raw chromiumoxide [`Page`] in a handle.
636 pub(crate) fn new(page: Page, cdp_timeout: Duration) -> Self {
637 Self {
638 page,
639 cdp_timeout,
640 last_status_code: Arc::new(AtomicU16::new(0)),
641 resource_filter_task: None,
642 }
643 }
644
645 ///
646 /// # Errors
647 ///
648 /// the CDP call fails.
649 pub async fn navigate(
650 &mut self,
651 url: &str,
652 condition: WaitUntil,
653 nav_timeout: Duration,
654 ) -> Result<()> {
655 self.setup_status_capture().await;
656 timeout(
657 nav_timeout,
658 self.navigate_inner(url, condition, nav_timeout),
659 )
660 .await
661 .map_err(|_| BrowserError::NavigationFailed {
662 url: url.to_string(),
663 reason: format!("navigation timed out after {nav_timeout:?}"),
664 })?
665 }
666
667 /// Reset the last status code and wire up the `Network.responseReceived`
668 /// so that a missing network domain never blocks navigation.
669 async fn setup_status_capture(&self) {
670 use chromiumoxide::cdp::browser_protocol::network::{
671 EventResponseReceived, ResourceType as NetworkResourceType,
672 };
673 use futures::StreamExt;
674
675 // Reset so a stale code is not returned if the new navigation fails
676 self.last_status_code.store(0, Ordering::Release);
677
678 let page_for_listener = self.page.clone();
679 let status_capture = Arc::clone(&self.last_status_code);
680 match page_for_listener
681 .event_listener::<EventResponseReceived>()
682 .await
683 {
684 Ok(mut stream) => {
685 tokio::spawn(async move {
686 while let Some(event) = stream.next().await {
687 if event.r#type == NetworkResourceType::Document {
688 let code = u16::try_from(event.response.status).unwrap_or(0);
689 if code > 0 {
690 status_capture.store(code, Ordering::Release);
691 }
692 break;
693 }
694 }
695 });
696 }
697 Err(e) => warn!("status-code capture unavailable: {e}"),
698 }
699 }
700
701 /// described in issue #7.
702 async fn navigate_inner(
703 &self,
704 url: &str,
705 condition: WaitUntil,
706 nav_timeout: Duration,
707 ) -> Result<()> {
708 use chromiumoxide::cdp::browser_protocol::page::{
709 EventDomContentEventFired, EventLoadEventFired,
710 };
711 use futures::StreamExt;
712
713 let url_owned = url.to_string();
714
715 let mut dom_events = match &condition {
716 WaitUntil::DomContentLoaded => Some(
717 self.page
718 .event_listener::<EventDomContentEventFired>()
719 .await
720 .map_err(|e| BrowserError::NavigationFailed {
721 url: url_owned.clone(),
722 reason: e.to_string(),
723 })?,
724 ),
725 _ => None,
726 };
727
728 let mut load_events = match &condition {
729 WaitUntil::NetworkIdle => Some(
730 self.page
731 .event_listener::<EventLoadEventFired>()
732 .await
733 .map_err(|e| BrowserError::NavigationFailed {
734 url: url_owned.clone(),
735 reason: e.to_string(),
736 })?,
737 ),
738 _ => None,
739 };
740
741 let inflight = if matches!(condition, WaitUntil::NetworkIdle) {
742 Some(self.subscribe_inflight_counter().await)
743 } else {
744 None
745 };
746
747 self.page
748 .goto(url)
749 .await
750 .map_err(|e| BrowserError::NavigationFailed {
751 url: url_owned.clone(),
752 reason: e.to_string(),
753 })?;
754
755 match &condition {
756 WaitUntil::DomContentLoaded => {
757 if let Some(ref mut events) = dom_events {
758 let _ = events.next().await;
759 }
760 }
761 WaitUntil::NetworkIdle => {
762 if let Some(ref mut events) = load_events {
763 let _ = events.next().await;
764 }
765 if let Some(ref counter) = inflight {
766 Self::wait_network_idle(counter).await;
767 }
768 }
769 WaitUntil::Selector(css) => {
770 self.wait_for_selector(css, nav_timeout).await?;
771 }
772 }
773 Ok(())
774 }
775
776 /// Spawn three detached tasks that maintain a signed in-flight request
777 /// counter via `Network.requestWillBeSent` (+1) and
778 /// `Network.loadingFinished`/`Network.loadingFailed` (−1 each).
779 async fn subscribe_inflight_counter(&self) -> Arc<std::sync::atomic::AtomicI32> {
780 use std::sync::atomic::AtomicI32;
781
782 use chromiumoxide::cdp::browser_protocol::network::{
783 EventLoadingFailed, EventLoadingFinished, EventRequestWillBeSent,
784 };
785 use futures::StreamExt;
786
787 let counter: Arc<AtomicI32> = Arc::new(AtomicI32::new(0));
788 let pairs: [(Arc<AtomicI32>, i32); 3] = [
789 (Arc::clone(&counter), 1),
790 (Arc::clone(&counter), -1),
791 (Arc::clone(&counter), -1),
792 ];
793 let [p1, p2, p3] = [self.page.clone(), self.page.clone(), self.page.clone()];
794
795 macro_rules! spawn_tracker {
796 ($page:expr, $event:ty, $c:expr, $delta:expr) => {
797 match $page.event_listener::<$event>().await {
798 Ok(mut s) => {
799 let c = $c;
800 let d = $delta;
801 tokio::spawn(async move {
802 while s.next().await.is_some() {
803 c.fetch_add(d, Ordering::Relaxed);
804 }
805 });
806 }
807 Err(e) => warn!("network-idle tracker unavailable: {e}"),
808 }
809 };
810 }
811
812 let [(c1, d1), (c2, d2), (c3, d3)] = pairs;
813 spawn_tracker!(p1, EventRequestWillBeSent, c1, d1);
814 spawn_tracker!(p2, EventLoadingFinished, c2, d2);
815 spawn_tracker!(p3, EventLoadingFailed, c3, d3);
816
817 counter
818 }
819
820 async fn wait_network_idle(counter: &Arc<std::sync::atomic::AtomicI32>) {
821 const IDLE_THRESHOLD: i32 = 2;
822 const SETTLE: Duration = Duration::from_millis(500);
823 loop {
824 if counter.load(Ordering::Relaxed) <= IDLE_THRESHOLD {
825 tokio::time::sleep(SETTLE).await;
826 if counter.load(Ordering::Relaxed) <= IDLE_THRESHOLD {
827 break;
828 }
829 } else {
830 tokio::time::sleep(Duration::from_millis(50)).await;
831 }
832 }
833 }
834
835 ///
836 /// # Errors
837 ///
838 /// within the given timeout.
839 pub async fn wait_for_selector(&self, selector: &str, wait_timeout: Duration) -> Result<()> {
840 let selector_owned = selector.to_string();
841 let poll = async {
842 loop {
843 if self.page.find_element(selector_owned.clone()).await.is_ok() {
844 return Ok(());
845 }
846 tokio::time::sleep(Duration::from_millis(100)).await;
847 }
848 };
849
850 timeout(wait_timeout, poll)
851 .await
852 .map_err(|_| BrowserError::NavigationFailed {
853 url: String::new(),
854 reason: format!("selector '{selector_owned}' not found within {wait_timeout:?}"),
855 })?
856 }
857
858 ///
859 /// Enables `Fetch` interception and spawns a background task that continues
860 /// allowed requests and fails blocked ones with `BlockedByClient`. Any
861 /// previously set filter task is cancelled first.
862 ///
863 /// # Errors
864 ///
865 pub async fn set_resource_filter(&mut self, filter: ResourceFilter) -> Result<()> {
866 use chromiumoxide::cdp::browser_protocol::fetch::{
867 ContinueRequestParams, EnableParams, EventRequestPaused, FailRequestParams,
868 RequestPattern,
869 };
870 use chromiumoxide::cdp::browser_protocol::network::ErrorReason;
871 use futures::StreamExt as _;
872
873 if filter.is_empty() {
874 return Ok(());
875 }
876
877 // Cancel any previously running filter task.
878 if let Some(task) = self.resource_filter_task.take() {
879 task.abort();
880 }
881
882 let pattern = RequestPattern::builder().url_pattern("*").build();
883 let params = EnableParams::builder()
884 .patterns(vec![pattern])
885 .handle_auth_requests(false)
886 .build();
887
888 timeout(self.cdp_timeout, self.page.execute::<EnableParams>(params))
889 .await
890 .map_err(|_| BrowserError::Timeout {
891 operation: "Fetch.enable".to_string(),
892 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
893 })?
894 .map_err(|e| BrowserError::CdpError {
895 operation: "Fetch.enable".to_string(),
896 message: e.to_string(),
897 })?;
898
899 // is never blocked. Without this handler Chrome holds every intercepted
900 // request indefinitely and the page hangs.
901 let mut events = self
902 .page
903 .event_listener::<EventRequestPaused>()
904 .await
905 .map_err(|e| BrowserError::CdpError {
906 operation: "Fetch.requestPaused subscribe".to_string(),
907 message: e.to_string(),
908 })?;
909
910 let page = self.page.clone();
911 debug!("Resource filter active: {:?}", filter);
912 let task = tokio::spawn(async move {
913 while let Some(event) = events.next().await {
914 let request_id = event.request_id.clone();
915 if filter.should_block(event.resource_type.as_ref()) {
916 let params = FailRequestParams::new(request_id, ErrorReason::BlockedByClient);
917 let _ = page.execute(params).await;
918 } else {
919 let _ = page.execute(ContinueRequestParams::new(request_id)).await;
920 }
921 }
922 });
923
924 self.resource_filter_task = Some(task);
925 Ok(())
926 }
927
928 /// Return the current page URL (post-navigation, post-redirect).
929 ///
930 /// internally by [`save_cookies`](Self::save_cookies); no extra network
931 /// request is made. Returns an empty string if the URL is not yet set
932 ///
933 /// # Errors
934 ///
935 /// [`BrowserError::Timeout`] if it exceeds `cdp_timeout`.
936 ///
937 /// # Example
938 ///
939 /// ```no_run
940 /// use stygian_browser::{BrowserPool, BrowserConfig};
941 /// use stygian_browser::page::WaitUntil;
942 /// use std::time::Duration;
943 ///
944 /// # async fn run() -> stygian_browser::error::Result<()> {
945 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
946 /// let handle = pool.acquire().await?;
947 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
948 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
949 /// let url = page.url().await?;
950 /// println!("Final URL after redirects: {url}");
951 /// # Ok(())
952 /// # }
953 /// ```
954 pub async fn url(&self) -> Result<String> {
955 timeout(self.cdp_timeout, self.page.url())
956 .await
957 .map_err(|_| BrowserError::Timeout {
958 operation: "page.url".to_string(),
959 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
960 })?
961 .map_err(|e| BrowserError::CdpError {
962 operation: "page.url".to_string(),
963 message: e.to_string(),
964 })
965 .map(Option::unwrap_or_default)
966 }
967
968 /// Return the HTTP status code of the most recent main-frame navigation.
969 ///
970 /// The status is captured from the `Network.responseReceived` CDP event
971 /// wired up inside [`navigate`](Self::navigate), so it reflects the
972 /// *final* response after any server-side redirects.
973 ///
974 /// navigations, when [`navigate`](Self::navigate) has not yet been called,
975 /// or if the network event subscription failed.
976 ///
977 /// # Errors
978 ///
979 ///
980 /// # Example
981 ///
982 /// ```no_run
983 /// use stygian_browser::{BrowserPool, BrowserConfig};
984 /// use stygian_browser::page::WaitUntil;
985 /// use std::time::Duration;
986 ///
987 /// # async fn run() -> stygian_browser::error::Result<()> {
988 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
989 /// let handle = pool.acquire().await?;
990 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
991 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
992 /// if let Some(code) = page.status_code()? {
993 /// println!("HTTP {code}");
994 /// }
995 /// # Ok(())
996 /// # }
997 /// ```
998 pub fn status_code(&self) -> Result<Option<u16>> {
999 let code = self.last_status_code.load(Ordering::Acquire);
1000 Ok(if code == 0 { None } else { Some(code) })
1001 }
1002
1003 /// Return the page's `<title>` text.
1004 ///
1005 /// # Errors
1006 ///
1007 pub async fn title(&self) -> Result<String> {
1008 timeout(self.cdp_timeout, self.page.get_title())
1009 .await
1010 .map_err(|_| BrowserError::Timeout {
1011 operation: "get_title".to_string(),
1012 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1013 })?
1014 .map_err(|e| BrowserError::ScriptExecutionFailed {
1015 script: "document.title".to_string(),
1016 reason: e.to_string(),
1017 })
1018 .map(Option::unwrap_or_default)
1019 }
1020
1021 /// Return the page's full outer HTML.
1022 ///
1023 /// # Errors
1024 ///
1025 pub async fn content(&self) -> Result<String> {
1026 timeout(self.cdp_timeout, self.page.content())
1027 .await
1028 .map_err(|_| BrowserError::Timeout {
1029 operation: "page.content".to_string(),
1030 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1031 })?
1032 .map_err(|e| BrowserError::ScriptExecutionFailed {
1033 script: "document.documentElement.outerHTML".to_string(),
1034 reason: e.to_string(),
1035 })
1036 }
1037
1038 /// lightweight [`NodeHandle`]s backed by CDP `RemoteObjectId`s.
1039 ///
1040 /// No HTML serialisation occurs — the browser's in-memory DOM is queried
1041 /// directly over the CDP connection, eliminating the `page.content()` +
1042 /// `scraper::Html::parse_document` round-trip.
1043 ///
1044 ///
1045 /// # Errors
1046 ///
1047 /// [`BrowserError::Timeout`] if it exceeds `cdp_timeout`.
1048 ///
1049 /// # Example
1050 ///
1051 /// ```no_run
1052 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
1053 /// use std::time::Duration;
1054 ///
1055 /// # async fn run() -> stygian_browser::error::Result<()> {
1056 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1057 /// let handle = pool.acquire().await?;
1058 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1059 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
1060 /// # let nodes = page.query_selector_all("div[data-ux]").await?;
1061 /// # for node in &nodes {
1062 /// let ux_type = node.attr("data-ux").await?;
1063 /// let text = node.text_content().await?;
1064 /// println!("{ux_type:?}: {text}");
1065 /// # }
1066 /// # Ok(())
1067 /// # }
1068 /// ```
1069 pub async fn query_selector_all(&self, selector: &str) -> Result<Vec<NodeHandle>> {
1070 let elements = timeout(self.cdp_timeout, self.page.find_elements(selector))
1071 .await
1072 .map_err(|_| BrowserError::Timeout {
1073 operation: "PageHandle::query_selector_all".to_string(),
1074 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1075 })?
1076 .map_err(|e| BrowserError::CdpError {
1077 operation: "PageHandle::query_selector_all".to_string(),
1078 message: e.to_string(),
1079 })?;
1080
1081 let selector_arc: Arc<str> = Arc::from(selector);
1082 Ok(elements
1083 .into_iter()
1084 .map(|el| NodeHandle {
1085 element: el,
1086 selector: selector_arc.clone(),
1087 cdp_timeout: self.cdp_timeout,
1088 page: self.page.clone(),
1089 })
1090 .collect())
1091 }
1092
1093 /// Evaluate arbitrary JavaScript and return the result as `T`.
1094 ///
1095 /// # Errors
1096 ///
1097 /// deserialization error.
1098 pub async fn eval<T: serde::de::DeserializeOwned>(&self, script: &str) -> Result<T> {
1099 let script_owned = script.to_string();
1100 timeout(self.cdp_timeout, self.page.evaluate(script))
1101 .await
1102 .map_err(|_| BrowserError::Timeout {
1103 operation: "page.evaluate".to_string(),
1104 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1105 })?
1106 .map_err(|e| BrowserError::ScriptExecutionFailed {
1107 script: script_owned.clone(),
1108 reason: e.to_string(),
1109 })?
1110 .into_value::<T>()
1111 .map_err(|e| BrowserError::ScriptExecutionFailed {
1112 script: script_owned,
1113 reason: e.to_string(),
1114 })
1115 }
1116
1117 ///
1118 /// # Errors
1119 ///
1120 pub async fn save_cookies(
1121 &self,
1122 ) -> Result<Vec<chromiumoxide::cdp::browser_protocol::network::Cookie>> {
1123 use chromiumoxide::cdp::browser_protocol::network::GetCookiesParams;
1124
1125 let url = self
1126 .page
1127 .url()
1128 .await
1129 .map_err(|e| BrowserError::CdpError {
1130 operation: "page.url".to_string(),
1131 message: e.to_string(),
1132 })?
1133 .unwrap_or_default();
1134
1135 timeout(
1136 self.cdp_timeout,
1137 self.page
1138 .execute(GetCookiesParams::builder().urls(vec![url]).build()),
1139 )
1140 .await
1141 .map_err(|_| BrowserError::Timeout {
1142 operation: "Network.getCookies".to_string(),
1143 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1144 })?
1145 .map_err(|e| BrowserError::CdpError {
1146 operation: "Network.getCookies".to_string(),
1147 message: e.to_string(),
1148 })
1149 .map(|r| r.cookies.clone())
1150 }
1151
1152 ///
1153 /// [`SessionSnapshot`][crate::session::SessionSnapshot] and without
1154 /// requiring a direct `chromiumoxide` dependency in calling code.
1155 ///
1156 /// Individual cookie failures are logged as warnings and do not abort the
1157 /// remaining cookies.
1158 ///
1159 /// # Errors
1160 ///
1161 /// call exceeds `cdp_timeout`.
1162 ///
1163 /// # Example
1164 ///
1165 /// ```no_run
1166 /// use stygian_browser::{BrowserPool, BrowserConfig};
1167 /// use stygian_browser::session::SessionCookie;
1168 /// use std::time::Duration;
1169 ///
1170 /// # async fn run() -> stygian_browser::error::Result<()> {
1171 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1172 /// let handle = pool.acquire().await?;
1173 /// let page = handle.browser().expect("valid browser").new_page().await?;
1174 /// let cookies = vec![SessionCookie {
1175 /// name: "session".to_string(),
1176 /// value: "abc123".to_string(),
1177 /// domain: ".example.com".to_string(),
1178 /// path: "/".to_string(),
1179 /// expires: -1.0,
1180 /// http_only: true,
1181 /// secure: true,
1182 /// same_site: "Lax".to_string(),
1183 /// }];
1184 /// page.inject_cookies(&cookies).await?;
1185 /// # Ok(())
1186 /// # }
1187 /// ```
1188 pub async fn inject_cookies(&self, cookies: &[crate::session::SessionCookie]) -> Result<()> {
1189 use chromiumoxide::cdp::browser_protocol::network::SetCookieParams;
1190
1191 for cookie in cookies {
1192 let params = match SetCookieParams::builder()
1193 .name(cookie.name.clone())
1194 .value(cookie.value.clone())
1195 .domain(cookie.domain.clone())
1196 .path(cookie.path.clone())
1197 .http_only(cookie.http_only)
1198 .secure(cookie.secure)
1199 .build()
1200 {
1201 Ok(p) => p,
1202 Err(e) => {
1203 warn!(cookie = %cookie.name, error = %e, "Failed to build cookie params");
1204 continue;
1205 }
1206 };
1207
1208 match timeout(self.cdp_timeout, self.page.execute(params)).await {
1209 Err(_) => {
1210 warn!(
1211 cookie = %cookie.name,
1212 timeout_ms = self.cdp_timeout.as_millis(),
1213 "Timed out injecting cookie"
1214 );
1215 }
1216 Ok(Err(e)) => {
1217 warn!(cookie = %cookie.name, error = %e, "Failed to inject cookie");
1218 }
1219 Ok(Ok(_)) => {}
1220 }
1221 }
1222
1223 debug!(count = cookies.len(), "Cookies injected");
1224 Ok(())
1225 }
1226
1227 /// Capture a screenshot of the current page as PNG bytes.
1228 ///
1229 /// them in-memory.
1230 ///
1231 /// # Errors
1232 ///
1233 /// command fails, or [`BrowserError::Timeout`] if it exceeds
1234 /// `cdp_timeout`.
1235 ///
1236 /// # Example
1237 ///
1238 /// ```no_run
1239 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
1240 /// use std::{time::Duration, fs};
1241 ///
1242 /// # async fn run() -> stygian_browser::error::Result<()> {
1243 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1244 /// let handle = pool.acquire().await?;
1245 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1246 /// let png = page.screenshot().await?;
1247 /// fs::write("screenshot.png", &png).unwrap();
1248 /// # Ok(())
1249 /// # }
1250 /// ```
1251 pub async fn screenshot(&self) -> Result<Vec<u8>> {
1252 use chromiumoxide::page::ScreenshotParams;
1253
1254 let params = ScreenshotParams::builder().full_page(true).build();
1255
1256 timeout(self.cdp_timeout, self.page.screenshot(params))
1257 .await
1258 .map_err(|_| BrowserError::Timeout {
1259 operation: "Page.captureScreenshot".to_string(),
1260 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1261 })?
1262 .map_err(|e| BrowserError::CdpError {
1263 operation: "Page.captureScreenshot".to_string(),
1264 message: e.to_string(),
1265 })
1266 }
1267
/// Borrow the underlying chromiumoxide [`Page`].
///
/// Returns a shared reference to the `page` field held by this handle; the
/// accessor itself performs no work beyond the borrow.
pub const fn inner(&self) -> &Page {
    &self.page
}
1272
1273 /// Close this page (tab).
1274 ///
1275 pub async fn close(self) -> Result<()> {
1276 timeout(Duration::from_secs(5), self.page.clone().close())
1277 .await
1278 .map_err(|_| BrowserError::Timeout {
1279 operation: "page.close".to_string(),
1280 duration_ms: 5000,
1281 })?
1282 .map_err(|e| BrowserError::CdpError {
1283 operation: "page.close".to_string(),
1284 message: e.to_string(),
1285 })
1286 }
1287}
1288
1289// ─── Stealth diagnostics ──────────────────────────────────────────────────────
1290
#[cfg(feature = "stealth")]
impl PageHandle {
    /// Run all built-in stealth detection checks against the current page.
    ///
    /// Iterates [`crate::diagnostic::all_checks`], evaluates each check's
    /// JavaScript via CDP `Runtime.evaluate`, and returns an aggregate
    /// [`crate::diagnostic::DiagnosticReport`].
    ///
    /// Checks run one after another. Scripts that fail to evaluate are
    /// recorded as failing checks and do **not** abort the whole run.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`.
    /// Individual check failures are captured in the report.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # async fn run() -> stygian_browser::error::Result<()> {
    /// use stygian_browser::{BrowserPool, BrowserConfig};
    /// use stygian_browser::page::WaitUntil;
    /// use std::time::Duration;
    ///
    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
    /// let handle = pool.acquire().await?;
    /// let browser = handle.browser().expect("valid browser");
    /// let mut page = browser.new_page().await?;
    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(10)).await?;
    ///
    /// let report = page.verify_stealth().await?;
    /// println!("Stealth: {}/{} checks passed", report.passed_count, report.checks.len());
    /// for failure in report.failures() {
    ///     eprintln!("  FAIL  {}: {}", failure.description, failure.details);
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub async fn verify_stealth(&self) -> Result<crate::diagnostic::DiagnosticReport> {
        use crate::diagnostic::{CheckResult, DiagnosticReport, all_checks};

        let mut outcomes: Vec<CheckResult> = Vec::new();

        for check in all_checks() {
            let evaluated = self.eval::<String>(check.script).await;
            let outcome = match evaluated {
                // A script that cannot be evaluated counts as a failed check.
                Err(e) => {
                    warn!(
                        check = ?check.id,
                        error = %e,
                        "stealth check script failed during evaluation"
                    );
                    CheckResult {
                        id: check.id,
                        description: check.description.to_string(),
                        passed: false,
                        details: format!("script error: {e}"),
                    }
                }
                Ok(json) => check.parse_output(&json),
            };
            debug!(
                check = ?outcome.id,
                passed = outcome.passed,
                details = %outcome.details,
                "stealth check result"
            );
            outcomes.push(outcome);
        }

        Ok(DiagnosticReport::new(outcomes))
    }

    /// Run stealth checks and attach transport diagnostics (JA3/JA4/HTTP3).
    ///
    /// Runs [`verify_stealth`](Self::verify_stealth), reads
    /// `navigator.userAgent` from the page, and builds the transport
    /// diagnostic from that user agent plus the optional `observed` data.
    /// If the user agent cannot be read, a warning is logged and an empty
    /// string is used instead.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by
    /// [`verify_stealth`](Self::verify_stealth).
    pub async fn verify_stealth_with_transport(
        &self,
        observed: Option<crate::diagnostic::TransportObservations>,
    ) -> Result<crate::diagnostic::DiagnosticReport> {
        use crate::diagnostic::TransportDiagnostic;

        let report = self.verify_stealth().await?;

        let user_agent = self
            .eval::<String>("navigator.userAgent")
            .await
            .unwrap_or_else(|e| {
                warn!(error = %e, "failed to read navigator.userAgent for transport diagnostics");
                String::new()
            });

        let transport =
            TransportDiagnostic::from_user_agent_and_observations(&user_agent, observed.as_ref());

        Ok(report.with_transport(transport))
    }
}
1385
1386// ─── extract feature ─────────────────────────────────────────────────────────
1387
#[cfg(feature = "extract")]
impl PageHandle {
    /// Extract one `T` from every element matching `selector`.
    ///
    /// The matching nodes are obtained with
    /// [`query_selector_all`](Self::query_selector_all), and
    /// `T::extract_from` is then applied to each of them.
    ///
    /// All per-node extractions are driven concurrently via
    /// [`futures::future::try_join_all`].
    ///
    /// # Errors
    ///
    /// Propagates any error from
    /// [`query_selector_all`](Self::query_selector_all) if the selector query
    /// fails, or [`BrowserError::ExtractionFailed`] if any field extraction
    /// fails.
    ///
    /// # Example
    ///
    /// ```ignore
    /// use stygian_browser::extract::Extract;
    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
    /// use std::time::Duration;
    ///
    /// #[derive(Extract)]
    /// struct Link {
    ///     href: Option<String>,
    /// }
    ///
    /// # async fn run() -> stygian_browser::error::Result<()> {
    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
    /// let handle = pool.acquire().await?;
    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
    /// page.navigate(
    ///     "https://example.com",
    ///     WaitUntil::DomContentLoaded,
    ///     Duration::from_secs(30),
    /// ).await?;
    /// let links: Vec<Link> = page.extract_all::<Link>("nav li").await?;
    /// # Ok(())
    /// # }
    /// ```
    pub async fn extract_all<T>(&self, selector: &str) -> Result<Vec<T>>
    where
        T: crate::extract::Extractable,
    {
        let nodes = self.query_selector_all(selector).await?;

        // One extraction future per node, awaited together.
        let pending = nodes.iter().map(|node| T::extract_from(node));
        match futures::future::try_join_all(pending).await {
            Ok(items) => Ok(items),
            Err(e) => Err(BrowserError::ExtractionFailed(e)),
        }
    }
}
1437
1438// ─── similarity feature ──────────────────────────────────────────────────────
1439
#[cfg(feature = "similarity")]
impl NodeHandle {
    /// Compute a structural [`crate::similarity::ElementFingerprint`] for this
    /// node.
    ///
    /// Issues a single `Runtime.callFunctionOn` JS eval that extracts the tag,
    /// class list, attribute names, and body-depth in one round-trip.
    ///
    /// # Errors
    ///
    /// Returns [`BrowserError::Timeout`] if the call exceeds `cdp_timeout`,
    /// the error produced by `cdp_err_or_stale` if the CDP call fails (for
    /// example because the node was invalidated), or
    /// [`BrowserError::ScriptExecutionFailed`] if the script
    /// produces unexpected output.
    pub async fn fingerprint(&self) -> Result<crate::similarity::ElementFingerprint> {
        // Runs with `this` bound to the element and returns a JSON string.
        const JS: &str = r"function() {
    var el = this;
    var tag = el.tagName.toLowerCase();
    var classes = Array.prototype.slice.call(el.classList).sort();
    var attrNames = Array.prototype.slice.call(el.attributes)
        .map(function(a) { return a.name; })
        .filter(function(n) { return n !== 'class' && n !== 'id'; })
        .sort();
    var depth = 0;
    var n = el.parentElement;
    while (n && n.tagName.toLowerCase() !== 'body') { depth++; n = n.parentElement; }
    return JSON.stringify({ tag: tag, classes: classes, attrNames: attrNames, depth: depth });
}";

        let call = self.element.call_js_fn(JS, true);
        let Ok(outcome) = timeout(self.cdp_timeout, call).await else {
            return Err(BrowserError::Timeout {
                operation: "NodeHandle::fingerprint".to_string(),
                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
            });
        };
        let returns = outcome.map_err(|e| self.cdp_err_or_stale(&e, "fingerprint"))?;

        // The script must hand back a string; anything else is a script failure.
        let Some(json_str) = returns.result.value.as_ref().and_then(|v| v.as_str()) else {
            return Err(BrowserError::ScriptExecutionFailed {
                script: "NodeHandle::fingerprint".to_string(),
                reason: "CDP returned no string value from fingerprint script".to_string(),
            });
        };

        match serde_json::from_str::<crate::similarity::ElementFingerprint>(json_str) {
            Ok(fingerprint) => Ok(fingerprint),
            Err(e) => Err(BrowserError::ScriptExecutionFailed {
                script: "NodeHandle::fingerprint".to_string(),
                reason: format!("failed to deserialise fingerprint JSON: {e}"),
            }),
        }
    }
}
1492
#[cfg(feature = "similarity")]
impl PageHandle {
    /// Find elements on the page that are structurally similar to
    /// `reference`, filtered by [`crate::similarity::SimilarityConfig`].
    ///
    /// Fingerprints `reference` (via
    /// [`NodeHandle::fingerprint`]), then fingerprints every candidate returned
    /// by `query_selector_all("*")` and keeps those whose
    /// [`crate::similarity::jaccard_weighted`] score is at least
    /// `config.threshold`. Results are ordered by score descending. When
    /// `config.max_results` is non-zero the list is truncated to that many
    /// entries; `0` leaves it unbounded.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
    /// use stygian_browser::similarity::SimilarityConfig;
    /// use std::time::Duration;
    ///
    /// # async fn run() -> stygian_browser::error::Result<()> {
    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
    /// let handle = pool.acquire().await?;
    /// let mut page = handle.browser().expect("valid browser").new_page().await?;
    /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
    ///
    /// let nodes = page.query_selector_all("h1").await?;
    /// let reference = nodes.into_iter().next().ok_or(stygian_browser::error::BrowserError::StaleNode { selector: "h1".to_string() })?;
    /// let similar = page.find_similar(&reference, SimilarityConfig::default()).await?;
    /// for m in &similar {
    ///     println!("score={:.2}", m.score);
    /// }
    /// # Ok(())
    /// # }
    /// ```
    ///
    /// # Errors
    ///
    /// Propagates any error from fingerprinting `reference` (including
    /// [`BrowserError::ScriptExecutionFailed`] if a scoring script fails) and
    /// from the candidate query. Candidates whose own fingerprint fails are
    /// skipped rather than reported.
    pub async fn find_similar(
        &self,
        reference: &NodeHandle,
        config: crate::similarity::SimilarityConfig,
    ) -> Result<Vec<crate::similarity::SimilarMatch>> {
        use crate::similarity::{SimilarMatch, jaccard_weighted};

        let reference_fp = reference.fingerprint().await?;
        // NOTE(review): `*` presumably matches `reference` itself as well, so it
        // may appear in the results — confirm whether callers expect that.
        let candidates = self.query_selector_all("*").await?;

        let mut matches: Vec<SimilarMatch> = Vec::new();
        for node in candidates {
            // Stale / detached nodes are silently skipped.
            let Ok(candidate_fp) = node.fingerprint().await else {
                continue;
            };
            let score = jaccard_weighted(&reference_fp, &candidate_fp);
            if score >= config.threshold {
                matches.push(SimilarMatch { node, score });
            }
        }

        // Highest score first; incomparable scores are treated as equal.
        matches.sort_by(|lhs, rhs| match rhs.score.partial_cmp(&lhs.score) {
            Some(order) => order,
            None => std::cmp::Ordering::Equal,
        });

        if config.max_results != 0 {
            matches.truncate(config.max_results);
        }

        Ok(matches)
    }
}
1561
1562impl Drop for PageHandle {
1563 fn drop(&mut self) {
1564 warn!("PageHandle dropped without explicit close(); spawning cleanup task");
1565 // chromiumoxide Page does not implement close on Drop, so we spawn
1566 // swap it out. We clone the Page handle (it's Arc-backed internally).
1567 let page = self.page.clone();
1568 tokio::spawn(async move {
1569 let _ = page.close().await;
1570 });
1571 }
1572}
1573
1574// ─── Tests ────────────────────────────────────────────────────────────────────
1575
#[cfg(test)]
mod tests {
    use super::*;

    // ── ResourceFilter / ResourceType ────────────────────────────────────────

    /// `block_media` blocks the four media-like resource types and nothing else
    /// from the sampled set.
    #[test]
    fn resource_filter_block_media_blocks_image() {
        let filter = ResourceFilter::block_media();
        for blocked in ["Image", "Font", "Stylesheet", "Media"] {
            assert!(filter.should_block(blocked));
        }
        for allowed in ["Script", "XHR"] {
            assert!(!filter.should_block(allowed));
        }
    }

    /// Resource-type matching ignores ASCII case.
    #[test]
    fn resource_filter_case_insensitive() {
        let filter = ResourceFilter::block_images_and_fonts();
        for spelling in ["image", "IMAGE"] {
            assert!(filter.should_block(spelling));
        }
        assert!(!filter.should_block("Stylesheet"));
    }

    /// Chained `block` calls accumulate.
    #[test]
    fn resource_filter_builder_chain() {
        let filter = ResourceFilter::default()
            .block(ResourceType::Image)
            .block(ResourceType::Font);
        for blocked in ["Image", "Font"] {
            assert!(filter.should_block(blocked));
        }
        assert!(!filter.should_block("Stylesheet"));
    }

    /// Blocking the same type twice stores it once.
    #[test]
    fn resource_filter_dedup_block() {
        let image = || ResourceType::Image;
        let filter = ResourceFilter::default().block(image()).block(image());
        assert_eq!(filter.blocked.len(), 1);
    }

    /// Only the default filter is empty.
    #[test]
    fn resource_filter_is_empty_when_default() {
        let empty = ResourceFilter::default();
        let media = ResourceFilter::block_media();
        assert!(empty.is_empty());
        assert!(!media.is_empty());
    }

    /// `WaitUntil::Selector` keeps the selector text it was built with.
    #[test]
    fn wait_until_selector_stores_string() {
        let wait = WaitUntil::Selector(String::from("#foo"));
        assert!(matches!(wait, WaitUntil::Selector(ref sel) if sel == "#foo"));
    }

    /// Each `ResourceType` maps to its CDP string.
    #[test]
    fn resource_type_cdp_str() {
        let expected = [
            (ResourceType::Image, "Image"),
            (ResourceType::Font, "Font"),
            (ResourceType::Stylesheet, "Stylesheet"),
            (ResourceType::Media, "Media"),
        ];
        for (resource, cdp) in expected {
            assert_eq!(resource.as_cdp_str(), cdp);
        }
    }

    /// Compile-time check that `PageHandle` can cross and be shared between
    /// threads.
    #[test]
    fn page_handle_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<PageHandle>();
    }

    // ── status_code sentinel logic ───────────────────────────────────────────

    /// The `0` sentinel and its mapping to
    /// `Option<u16>` are pure-logic invariants testable without a live browser.
    #[test]
    fn status_code_sentinel_zero_maps_to_none() {
        use std::sync::atomic::{AtomicU16, Ordering};
        let raw = AtomicU16::new(0).load(Ordering::Acquire);
        let mapped = if raw == 0 { None } else { Some(raw) };
        assert_eq!(mapped, None::<u16>);
    }

    /// Any non-zero stored value is reported unchanged.
    #[test]
    fn status_code_non_zero_maps_to_some() {
        use std::sync::atomic::{AtomicU16, Ordering};
        for expected in [200u16, 301, 404, 503] {
            let raw = AtomicU16::new(expected).load(Ordering::Acquire);
            let mapped = if raw == 0 { None } else { Some(raw) };
            assert_eq!(mapped, Some(expected));
        }
    }

    // ── NodeHandle pure-logic tests ───────────────────────────────────────────

    /// `attr_map` relies on `chunks_exact(2)` — verify the pairing logic is
    /// correct without a live browser by exercising it directly.
    #[test]
    fn attr_map_chunking_pairs_correctly() {
        let flat: Vec<String> = ["id", "main", "data-ux", "Section", "class", "container"]
            .into_iter()
            .map(str::to_string)
            .collect();

        let mut map = std::collections::HashMap::with_capacity(flat.len() / 2);
        for pair in flat.chunks_exact(2) {
            if let [name, value] = pair {
                map.insert(name.clone(), value.clone());
            }
        }

        for (key, expected) in [("id", "main"), ("data-ux", "Section"), ("class", "container")] {
            assert_eq!(map.get(key).map(String::as_str), Some(expected));
        }
        assert_eq!(map.len(), 3);
    }

    /// An odd-length flat list is handled
    /// gracefully — the trailing element is silently ignored.
    #[test]
    fn attr_map_chunking_ignores_odd_trailing() {
        let flat = [String::from("orphan")]; // no value
        let map: std::collections::HashMap<String, String> = flat
            .chunks_exact(2)
            .filter_map(|pair| match pair {
                [name, value] => Some((name.clone(), value.clone())),
                _ => None,
            })
            .collect();
        assert!(map.is_empty());
    }

    /// Empty flat list → empty map.
    #[test]
    fn attr_map_chunking_empty_input() {
        let flat: Vec<String> = Vec::new();
        let mut map: std::collections::HashMap<String, String> =
            std::collections::HashMap::new();
        for pair in flat.chunks_exact(2) {
            if let [name, value] = pair {
                map.insert(name.clone(), value.clone());
            }
        }
        assert!(map.is_empty());
    }

    /// A JSON array of tag names parses into the same sequence of strings.
    #[test]
    fn ancestors_json_parse_round_trip() -> std::result::Result<(), serde_json::Error> {
        let parsed: Vec<String> = serde_json::from_str(r#"["p","article","body","html"]"#)?;
        assert_eq!(parsed, ["p", "article", "body", "html"]);
        Ok(())
    }

    /// An empty JSON array parses into an empty list.
    #[test]
    fn ancestors_json_parse_empty() -> std::result::Result<(), serde_json::Error> {
        let parsed: Vec<String> = serde_json::from_str("[]")?;
        assert!(parsed.is_empty());
        Ok(())
    }

    /// A `StaleNode` error whose selector carries a traversal suffix (e.g.
    /// `"div::parent"`) must surface that suffix in its `Display` output so
    /// callers can locate the failed traversal in logs.
    #[test]
    fn traversal_selector_suffix_in_stale_error() {
        use crate::error::BrowserError;

        let selector = "div::parent";
        let msg = BrowserError::StaleNode {
            selector: selector.to_string(),
        }
        .to_string();
        assert!(
            msg.contains(selector),
            "StaleNode display must include the full selector; got: {msg}"
        );
    }

    /// Same guarantee for the `::next` suffix.
    #[test]
    fn traversal_next_suffix_in_stale_error() {
        use crate::error::BrowserError;

        let selector = "li.price::next";
        let rendered = BrowserError::StaleNode {
            selector: selector.to_string(),
        }
        .to_string();
        assert!(rendered.contains(selector));
    }

    /// Same guarantee for the `::prev` suffix.
    #[test]
    fn traversal_prev_suffix_in_stale_error() {
        use crate::error::BrowserError;

        let selector = "td.label::prev";
        let rendered = BrowserError::StaleNode {
            selector: selector.to_string(),
        }
        .to_string();
        assert!(rendered.contains(selector));
    }
}