stygian_browser/page.rs
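//! Page-level browser automation: navigation, wait strategies, resource
//! blocking, and DOM node handles.
//!
//! ## Resource blocking
//!
//! A [`ResourceFilter`] installed with [`PageHandle::set_resource_filter`]
//! fails requests for the selected resource types (images, fonts,
//! stylesheets, media) and lets every other request continue.
//!
//! ## Wait strategies
//!
//! [`PageHandle`] exposes three wait strategies via [`WaitUntil`]:
//! - `DomContentLoaded` — fires when the HTML is parsed
//! - `NetworkIdle` — fires after the load event once in-flight requests settle
//! - `Selector(css)` — fires when the CSS selector matches an element
//!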
9//! # Example
10//!
11//! ```no_run
12//! use stygian_browser::{BrowserPool, BrowserConfig};
13//! use stygian_browser::page::{ResourceFilter, WaitUntil};
14//! use std::time::Duration;
15//!
16//! # async fn run() -> stygian_browser::error::Result<()> {
17//! let pool = BrowserPool::new(BrowserConfig::default()).await?;
18//! let handle = pool.acquire().await?;
19//!
20//! let mut page = handle.browser().expect("valid browser").new_page().await?;
21//! page.set_resource_filter(ResourceFilter::block_media()).await?;
22//! page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
23//! let title = page.title().await?;
24//! println!("title: {title}");
25//! handle.release().await;
26//! # Ok(())
27//! # }
28//! ```
29
30use std::collections::HashMap;
31use std::sync::{
32 Arc,
33 atomic::{AtomicU16, Ordering},
34};
35use std::time::Duration;
36
37use chromiumoxide::Page;
38use serde::{Deserialize, Serialize};
39use tokio::time::timeout;
40use tracing::{debug, warn};
41
42use crate::error::{BrowserError, Result};
43
44// ─── ResourceType ─────────────────────────────────────────────────────────────
45
/// CDP resource types that can be intercepted.
///
/// The enum is fieldless, so values are `Copy` and can be used as hash keys
/// or set members.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ResourceType {
    /// `<img>`, `<picture>`, background images
    Image,
    /// Web fonts loaded via CSS `@font-face`
    Font,
    /// External CSS stylesheets
    Stylesheet,
    /// Media files (audio/video)
    Media,
}

impl ResourceType {
    /// Return the resource-type name as a static string.
    ///
    /// The text is exactly the variant name (`"Image"`, `"Font"`,
    /// `"Stylesheet"`, `"Media"`).
    #[must_use]
    pub const fn as_cdp_str(&self) -> &'static str {
        match self {
            Self::Image => "Image",
            Self::Font => "Font",
            Self::Stylesheet => "Stylesheet",
            Self::Media => "Media",
        }
    }
}
69
70// ─── ResourceFilter ───────────────────────────────────────────────────────────
71
72///
73/// # Example
74///
75/// ```
76/// use stygian_browser::page::ResourceFilter;
77/// let filter = ResourceFilter::block_media();
78/// assert!(filter.should_block("Image"));
79/// ```
80#[derive(Debug, Clone, Default)]
81pub struct ResourceFilter {
82 blocked: Vec<ResourceType>,
83}
84
85impl ResourceFilter {
86 /// Block all media resources (images, fonts, CSS, audio/video).
87 pub fn block_media() -> Self {
88 Self {
89 blocked: vec![
90 ResourceType::Image,
91 ResourceType::Font,
92 ResourceType::Stylesheet,
93 ResourceType::Media,
94 ],
95 }
96 }
97
98 pub fn block_images_and_fonts() -> Self {
99 Self {
100 blocked: vec![ResourceType::Image, ResourceType::Font],
101 }
102 }
103
104 #[must_use]
105 pub fn block(mut self, resource: ResourceType) -> Self {
106 if !self.blocked.contains(&resource) {
107 self.blocked.push(resource);
108 }
109 self
110 }
111
112 pub fn should_block(&self, cdp_type: &str) -> bool {
113 self.blocked
114 .iter()
115 .any(|r| r.as_cdp_str().eq_ignore_ascii_case(cdp_type))
116 }
117
118 pub const fn is_empty(&self) -> bool {
119 self.blocked.is_empty()
120 }
121}
122
123// ─── WaitUntil ────────────────────────────────────────────────────────────────
124
/// Specifies what condition to wait for after a page navigation.
///
/// # Example
///
/// ```
/// use stygian_browser::page::WaitUntil;
///
/// let wait = WaitUntil::Selector("main article".to_string());
/// assert_eq!(wait, WaitUntil::Selector("main article".to_string()));
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum WaitUntil {
    /// Fires when the initial HTML is fully parsed, without waiting for
    /// subresources such as images and stylesheets to finish loading.
    DomContentLoaded,
    /// Fires after the page `load` event, once the number of in-flight
    /// network requests has stayed low for a short settle period.
    NetworkIdle,
    /// Fires when the given CSS selector matches an element in the page.
    Selector(String),
}
140
141// ─── NodeHandle ───────────────────────────────────────────────────────────────
142
/// A handle to a single DOM element living in the browser.
///
/// Each accessor on the handle operates on the live element by issuing one or
/// more CDP `Runtime.callFunctionOn` calls against the held V8 remote object
/// reference — no HTML serialisation occurs.
///
/// A handle becomes **stale** after page navigation or if the underlying DOM
/// node is removed. Stale calls return [`BrowserError::StaleNode`] so callers
/// can distinguish them from other CDP failures.
///
/// # Example
///
/// ```no_run
/// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
/// use std::time::Duration;
///
/// # async fn run() -> stygian_browser::error::Result<()> {
/// let pool = BrowserPool::new(BrowserConfig::default()).await?;
/// let handle = pool.acquire().await?;
/// let mut page = handle.browser().expect("valid browser").new_page().await?;
/// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
/// let nodes = page.query_selector_all("a").await?;
/// for node in &nodes {
///     let href = node.attr("href").await?;
///     let text = node.text_content().await?;
///     println!("{text}: {href:?}");
/// }
/// # Ok(())
/// # }
/// ```
pub struct NodeHandle {
    /// The chromiumoxide element this handle wraps.
    element: chromiumoxide::element::Element,
    /// Selector the handle was obtained from; reported in
    /// [`BrowserError::StaleNode`].
    ///
    /// Shared via `Arc<str>` so all handles from a single query reuse the
    /// same allocation rather than cloning a `String` per node.
    selector: Arc<str>,
    /// Upper bound applied to each CDP call issued through this handle.
    cdp_timeout: Duration,
    /// Page the element belongs to; used to resolve the tagged target element
    /// during DOM traversal (parent / sibling navigation).
    page: chromiumoxide::Page,
}
180
181impl NodeHandle {
182 /// Return a single attribute value, or `None` if the attribute is absent.
183 ///
184 /// Issues one `Runtime.callFunctionOn` CDP call (`el.getAttribute(name)`).
185 ///
186 /// # Errors
187 ///
188 /// invalidated, or [`BrowserError::Timeout`] / [`BrowserError::CdpError`]
189 /// on transport-level failures.
190 pub async fn attr(&self, name: &str) -> Result<Option<String>> {
191 timeout(self.cdp_timeout, self.element.attribute(name))
192 .await
193 .map_err(|_| BrowserError::Timeout {
194 operation: "NodeHandle::attr".to_string(),
195 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
196 })?
197 .map_err(|e| self.cdp_err_or_stale(&e, "attr"))
198 }
199
200 /// Return all attributes as a `HashMap<name, value>` in a **single**
201 /// CDP round-trip.
202 ///
203 /// Uses `DOM.getAttributes` (via the chromiumoxide `attributes()` API)
204 /// which returns a flat `[name, value, name, value, …]` list from the node
205 /// description — no per-attribute calls are needed.
206 ///
207 /// # Errors
208 ///
209 /// invalidated.
210 pub async fn attr_map(&self) -> Result<HashMap<String, String>> {
211 let flat = timeout(self.cdp_timeout, self.element.attributes())
212 .await
213 .map_err(|_| BrowserError::Timeout {
214 operation: "NodeHandle::attr_map".to_string(),
215 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
216 })?
217 .map_err(|e| self.cdp_err_or_stale(&e, "attr_map"))?;
218
219 let mut map = HashMap::with_capacity(flat.len() / 2);
220 for pair in flat.chunks_exact(2) {
221 if let [name, value] = pair {
222 map.insert(name.clone(), value.clone());
223 }
224 }
225 Ok(map)
226 }
227
228 /// Return the element's `textContent` (all text inside, no markup).
229 ///
230 /// Reads the DOM `textContent` property via a single JS eval — this is the
231 /// raw text concatenation of all descendant text nodes, independent of
232 /// layout or visibility (unlike `innerText`).
233 ///
234 ///
235 /// # Errors
236 ///
237 /// invalidated.
238 pub async fn text_content(&self) -> Result<String> {
239 let returns = timeout(
240 self.cdp_timeout,
241 self.element
242 .call_js_fn(r"function() { return this.textContent ?? ''; }", true),
243 )
244 .await
245 .map_err(|_| BrowserError::Timeout {
246 operation: "NodeHandle::text_content".to_string(),
247 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
248 })?
249 .map_err(|e| self.cdp_err_or_stale(&e, "text_content"))?;
250
251 Ok(returns
252 .result
253 .value
254 .as_ref()
255 .and_then(|v| v.as_str())
256 .unwrap_or("")
257 .to_string())
258 }
259
260 /// Return the element's `innerHTML`.
261 ///
262 ///
263 /// # Errors
264 ///
265 /// invalidated.
266 pub async fn inner_html(&self) -> Result<String> {
267 timeout(self.cdp_timeout, self.element.inner_html())
268 .await
269 .map_err(|_| BrowserError::Timeout {
270 operation: "NodeHandle::inner_html".to_string(),
271 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
272 })?
273 .map_err(|e| self.cdp_err_or_stale(&e, "inner_html"))
274 .map(Option::unwrap_or_default)
275 }
276
277 /// Return the element's `outerHTML`.
278 ///
279 ///
280 /// # Errors
281 ///
282 /// invalidated.
283 pub async fn outer_html(&self) -> Result<String> {
284 timeout(self.cdp_timeout, self.element.outer_html())
285 .await
286 .map_err(|_| BrowserError::Timeout {
287 operation: "NodeHandle::outer_html".to_string(),
288 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
289 })?
290 .map_err(|e| self.cdp_err_or_stale(&e, "outer_html"))
291 .map(Option::unwrap_or_default)
292 }
293
294 ///
295 /// Executes a single `Runtime.callFunctionOn` JavaScript function that
296 /// walks `parentElement` and collects tag names — no repeated CDP calls.
297 ///
298 /// ```text
299 /// ["p", "article", "body", "html"]
300 /// ```
301 ///
302 /// # Errors
303 ///
304 /// invalidated, or [`BrowserError::ScriptExecutionFailed`] when CDP
305 pub async fn ancestors(&self) -> Result<Vec<String>> {
306 let returns = timeout(
307 self.cdp_timeout,
308 self.element.call_js_fn(
309 r"function() {
310 const a = [];
311 let n = this.parentElement;
312 while (n) { a.push(n.tagName.toLowerCase()); n = n.parentElement; }
313 return a;
314 }",
315 true,
316 ),
317 )
318 .await
319 .map_err(|_| BrowserError::Timeout {
320 operation: "NodeHandle::ancestors".to_string(),
321 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
322 })?
323 .map_err(|e| self.cdp_err_or_stale(&e, "ancestors"))?;
324
325 // With returnByValue=true and an array return, CDP delivers the value
326 // as a JSON array directly — no JSON.stringify/re-parse needed.
327 // A missing or wrong-type value indicates an unexpected CDP failure.
328 let arr = returns
329 .result
330 .value
331 .as_ref()
332 .and_then(|v| v.as_array())
333 .ok_or_else(|| BrowserError::ScriptExecutionFailed {
334 script: "NodeHandle::ancestors".to_string(),
335 reason: "CDP returned no value or a non-array value for ancestors()".to_string(),
336 })?;
337
338 arr.iter()
339 .map(|v| {
340 v.as_str().map(ToString::to_string).ok_or_else(|| {
341 BrowserError::ScriptExecutionFailed {
342 script: "NodeHandle::ancestors".to_string(),
343 reason: format!("ancestor entry is not a string: {v}"),
344 }
345 })
346 })
347 .collect()
348 }
349
350 ///
351 ///
352 ///
353 /// # Errors
354 ///
355 /// invalidated, or [`BrowserError::CdpError`] on transport failure.
356 pub async fn children_matching(&self, selector: &str) -> Result<Vec<Self>> {
357 let elements = timeout(self.cdp_timeout, self.element.find_elements(selector))
358 .await
359 .map_err(|_| BrowserError::Timeout {
360 operation: "NodeHandle::children_matching".to_string(),
361 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
362 })?
363 .map_err(|e| self.cdp_err_or_stale(&e, "children_matching"))?;
364
365 let selector_arc: Arc<str> = Arc::from(selector);
366 Ok(elements
367 .into_iter()
368 .map(|el| Self {
369 element: el,
370 selector: selector_arc.clone(),
371 cdp_timeout: self.cdp_timeout,
372 page: self.page.clone(),
373 })
374 .collect())
375 }
376
377 /// Return the immediate parent element, or `None` if this element has no
378 /// parent (i.e. it is the document root).
379 ///
380 /// Issues a single `Runtime.callFunctionOn` CDP call that temporarily tags
381 /// the parent element with a unique attribute, then resolves it via a
382 /// CSS attribute selector.
383 ///
384 /// # Errors
385 ///
386 /// Returns an error if the CDP call fails or the page handle is invalidated.
387 ///
388 /// # Example
389 ///
390 /// ```no_run
391 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
392 /// use std::time::Duration;
393 ///
394 /// # async fn run() -> stygian_browser::error::Result<()> {
395 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
396 /// let handle = pool.acquire().await?;
397 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
398 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
399 /// # let nodes = page.query_selector_all("a").await?;
400 /// if let Some(parent) = nodes[0].parent().await? {
401 /// let html = parent.outer_html().await?;
402 /// println!("parent: {}", &html[..html.len().min(80)]);
403 /// }
404 /// # Ok(())
405 /// # }
406 /// ```
407 pub async fn parent(&self) -> Result<Option<Self>> {
408 let attr = format!(
409 "data-stygian-t-{}",
410 ulid::Ulid::new().to_string().to_lowercase()
411 );
412 let js = format!(
413 "function() {{ \
414 var t = this.parentElement; \
415 if (!t) {{ return false; }} \
416 t.setAttribute('{attr}', '1'); \
417 return true; \
418 }}"
419 );
420 self.call_traversal(&js, &attr, "parent").await
421 }
422
423 /// Return the next element sibling, or `None` if this element is the last
424 /// child of its parent.
425 ///
426 /// Uses `nextElementSibling` (skips text/comment nodes).
427 ///
428 /// # Errors
429 ///
430 /// invalidated.
431 ///
432 /// # Example
433 ///
434 /// ```no_run
435 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
436 /// use std::time::Duration;
437 ///
438 /// # async fn run() -> stygian_browser::error::Result<()> {
439 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
440 /// let handle = pool.acquire().await?;
441 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
442 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
443 /// # let nodes = page.query_selector_all("a").await?;
444 /// if let Some(next) = nodes[0].next_sibling().await? {
445 /// println!("next sibling: {}", next.text_content().await?);
446 /// }
447 /// # Ok(())
448 /// # }
449 /// ```
450 pub async fn next_sibling(&self) -> Result<Option<Self>> {
451 let attr = format!(
452 "data-stygian-t-{}",
453 ulid::Ulid::new().to_string().to_lowercase()
454 );
455 let js = format!(
456 "function() {{ \
457 var t = this.nextElementSibling; \
458 if (!t) {{ return false; }} \
459 t.setAttribute('{attr}', '1'); \
460 return true; \
461 }}"
462 );
463 self.call_traversal(&js, &attr, "next").await
464 }
465
466 /// Return the previous element sibling, or `None` if this element is the
467 /// first child of its parent.
468 ///
469 /// Uses `previousElementSibling` (skips text/comment nodes).
470 ///
471 /// # Errors
472 ///
473 /// invalidated.
474 ///
475 /// # Example
476 ///
477 /// ```no_run
478 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
479 /// use std::time::Duration;
480 ///
481 /// # async fn run() -> stygian_browser::error::Result<()> {
482 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
483 /// let handle = pool.acquire().await?;
484 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
485 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
486 /// # let nodes = page.query_selector_all("a").await?;
487 /// if let Some(prev) = nodes[1].previous_sibling().await? {
488 /// println!("prev sibling: {}", prev.text_content().await?);
489 /// }
490 /// # Ok(())
491 /// # }
492 /// ```
493 pub async fn previous_sibling(&self) -> Result<Option<Self>> {
494 let attr = format!(
495 "data-stygian-t-{}",
496 ulid::Ulid::new().to_string().to_lowercase()
497 );
498 let js = format!(
499 "function() {{ \
500 var t = this.previousElementSibling; \
501 if (!t) {{ return false; }} \
502 t.setAttribute('{attr}', '1'); \
503 return true; \
504 }}"
505 );
506 self.call_traversal(&js, &attr, "prev").await
507 }
508
509 /// Shared traversal implementation used by [`parent`], [`next_sibling`],
510 /// and [`previous_sibling`].
511 ///
512 /// The caller provides a JS function that:
513 /// 1. Computes the traversal target (for example, the parent, next
514 /// sibling, or previous sibling) and stores it in a local variable.
515 /// 2. If the target is non-null, sets a unique attribute (`attr_name`)
516 /// on it and returns `true`.
517 /// 3. Returns `false` when the target is null (no such neighbour).
518 ///
519 /// This helper then resolves the tagged element from the document root,
520 /// removes the temporary attribute, and wraps the result in a
521 /// `NodeHandle`.
522 ///
523 /// [`parent`]: Self::parent
524 /// [`next_sibling`]: Self::next_sibling
525 /// [`previous_sibling`]: Self::previous_sibling
526 async fn call_traversal(
527 &self,
528 js_fn: &str,
529 attr_name: &str,
530 selector_suffix: &str,
531 ) -> Result<Option<Self>> {
532 // Step 1: Run the JS that tags the target element and reports null/non-null.
533 let op_tag = format!("NodeHandle::{selector_suffix}::tag");
534 let returns = timeout(self.cdp_timeout, self.element.call_js_fn(js_fn, false))
535 .await
536 .map_err(|_| BrowserError::Timeout {
537 operation: op_tag.clone(),
538 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
539 })?
540 .map_err(|e| self.cdp_err_or_stale(&e, selector_suffix))?;
541
542 // JS returns false → no such neighbour.
543 let has_target = returns
544 .result
545 .value
546 .as_ref()
547 .and_then(serde_json::Value::as_bool)
548 .unwrap_or(false);
549 if !has_target {
550 return Ok(None);
551 }
552
553 let css = format!("[{attr_name}]");
554 let op_resolve = format!("NodeHandle::{selector_suffix}::resolve");
555 let element = timeout(self.cdp_timeout, self.page.find_element(css))
556 .await
557 .map_err(|_| BrowserError::Timeout {
558 operation: op_resolve.clone(),
559 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
560 })?
561 .map_err(|e| BrowserError::CdpError {
562 operation: op_resolve,
563 message: format!("{e:?}"),
564 })?;
565
566 // is non-fatal — it leaves a harmless stale attribute in the DOM).
567 let cleanup = format!("function() {{ this.removeAttribute('{attr_name}'); }}");
568 let _ = element.call_js_fn(cleanup, false).await;
569
570 let new_selector: Arc<str> =
571 Arc::from(format!("{}::{selector_suffix}", self.selector).as_str());
572 Ok(Some(Self {
573 element,
574 selector: new_selector,
575 cdp_timeout: self.cdp_timeout,
576 page: self.page.clone(),
577 }))
578 }
579
580 /// (when the remote object reference has been invalidated) or
581 fn cdp_err_or_stale(
582 &self,
583 err: &chromiumoxide::error::CdpError,
584 operation: &str,
585 ) -> BrowserError {
586 let msg = format!("{err:?}");
587 if msg.contains("Cannot find object with id")
588 || msg.contains("context with specified id")
589 || msg.contains("Cannot find context")
590 {
591 BrowserError::StaleNode {
592 selector: self.selector.to_string(),
593 }
594 } else {
595 BrowserError::CdpError {
596 operation: operation.to_string(),
597 message: msg,
598 }
599 }
600 }
601}
602
603// ─── PageHandle ───────────────────────────────────────────────────────────────
604
/// A handle to a single browser page, wrapping a chromiumoxide [`Page`].
///
/// Besides the page itself, the handle carries the timeout applied to CDP
/// calls, the status code of the last navigation, and the optional background
/// task that services resource filtering.
///
/// # Example
///
/// ```no_run
/// use stygian_browser::{BrowserPool, BrowserConfig};
/// use stygian_browser::page::WaitUntil;
/// use std::time::Duration;
///
/// # async fn run() -> stygian_browser::error::Result<()> {
/// let pool = BrowserPool::new(BrowserConfig::default()).await?;
/// let handle = pool.acquire().await?;
/// let mut page = handle.browser().expect("valid browser").new_page().await?;
/// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
/// let html = page.content().await?;
/// drop(page); // closes the tab
/// handle.release().await;
/// # Ok(())
/// # }
/// ```
pub struct PageHandle {
    /// The underlying chromiumoxide page.
    page: Page,
    /// Upper bound applied to individual CDP calls made through this handle.
    cdp_timeout: Duration,
    /// HTTP status code of the most recent main-frame navigation, or `0` if not
    /// yet captured. It is reset to `0` at the start of every navigation.
    last_status_code: Arc<AtomicU16>,
    /// Background task processing `Fetch.requestPaused` events. Aborted and
    /// replaced each time `set_resource_filter` is called with a non-empty
    /// filter.
    resource_filter_task: Option<tokio::task::JoinHandle<()>>,
}
634
635impl PageHandle {
636 /// Wrap a raw chromiumoxide [`Page`] in a handle.
637 pub(crate) fn new(page: Page, cdp_timeout: Duration) -> Self {
638 Self {
639 page,
640 cdp_timeout,
641 last_status_code: Arc::new(AtomicU16::new(0)),
642 resource_filter_task: None,
643 }
644 }
645
646 ///
647 /// # Errors
648 ///
649 /// the CDP call fails.
650 pub async fn navigate(
651 &mut self,
652 url: &str,
653 condition: WaitUntil,
654 nav_timeout: Duration,
655 ) -> Result<()> {
656 self.setup_status_capture().await;
657 timeout(
658 nav_timeout,
659 self.navigate_inner(url, condition, nav_timeout),
660 )
661 .await
662 .map_err(|_| BrowserError::NavigationFailed {
663 url: url.to_string(),
664 reason: format!("navigation timed out after {nav_timeout:?}"),
665 })?
666 }
667
668 /// Reset the last status code and wire up the `Network.responseReceived`
669 /// so that a missing network domain never blocks navigation.
670 async fn setup_status_capture(&self) {
671 use chromiumoxide::cdp::browser_protocol::network::{
672 EventResponseReceived, ResourceType as NetworkResourceType,
673 };
674 use futures::StreamExt;
675
676 // Reset so a stale code is not returned if the new navigation fails
677 self.last_status_code.store(0, Ordering::Release);
678
679 let page_for_listener = self.page.clone();
680 let status_capture = Arc::clone(&self.last_status_code);
681 match page_for_listener
682 .event_listener::<EventResponseReceived>()
683 .await
684 {
685 Ok(mut stream) => {
686 tokio::spawn(async move {
687 while let Some(event) = stream.next().await {
688 if event.r#type == NetworkResourceType::Document {
689 let code = u16::try_from(event.response.status).unwrap_or(0);
690 if code > 0 {
691 status_capture.store(code, Ordering::Release);
692 }
693 break;
694 }
695 }
696 });
697 }
698 Err(e) => warn!("status-code capture unavailable: {e}"),
699 }
700 }
701
702 /// described in issue #7.
703 async fn navigate_inner(
704 &self,
705 url: &str,
706 condition: WaitUntil,
707 nav_timeout: Duration,
708 ) -> Result<()> {
709 use chromiumoxide::cdp::browser_protocol::page::{
710 EventDomContentEventFired, EventLoadEventFired,
711 };
712 use futures::StreamExt;
713
714 let url_owned = url.to_string();
715
716 let mut dom_events = match &condition {
717 WaitUntil::DomContentLoaded => Some(
718 self.page
719 .event_listener::<EventDomContentEventFired>()
720 .await
721 .map_err(|e| BrowserError::NavigationFailed {
722 url: url_owned.clone(),
723 reason: format!("{e:?}"),
724 })?,
725 ),
726 _ => None,
727 };
728
729 let mut load_events = match &condition {
730 WaitUntil::NetworkIdle => Some(
731 self.page
732 .event_listener::<EventLoadEventFired>()
733 .await
734 .map_err(|e| BrowserError::NavigationFailed {
735 url: url_owned.clone(),
736 reason: e.to_string(),
737 })?,
738 ),
739 _ => None,
740 };
741
742 let inflight = if matches!(condition, WaitUntil::NetworkIdle) {
743 Some(self.subscribe_inflight_counter().await)
744 } else {
745 None
746 };
747
748 self.page
749 .goto(url)
750 .await
751 .map_err(|e| BrowserError::NavigationFailed {
752 url: url_owned.clone(),
753 reason: e.to_string(),
754 })?;
755
756 match &condition {
757 WaitUntil::DomContentLoaded => {
758 if let Some(ref mut events) = dom_events {
759 let _ = events.next().await;
760 }
761 }
762 WaitUntil::NetworkIdle => {
763 if let Some(ref mut events) = load_events {
764 let _ = events.next().await;
765 }
766 if let Some(ref counter) = inflight {
767 Self::wait_network_idle(counter).await;
768 }
769 }
770 WaitUntil::Selector(css) => {
771 self.wait_for_selector(css, nav_timeout).await?;
772 }
773 }
774 Ok(())
775 }
776
777 /// Spawn three detached tasks that maintain a signed in-flight request
778 /// counter via `Network.requestWillBeSent` (+1) and
779 /// `Network.loadingFinished`/`Network.loadingFailed` (−1 each).
780 async fn subscribe_inflight_counter(&self) -> Arc<std::sync::atomic::AtomicI32> {
781 use std::sync::atomic::AtomicI32;
782
783 use chromiumoxide::cdp::browser_protocol::network::{
784 EventLoadingFailed, EventLoadingFinished, EventRequestWillBeSent,
785 };
786 use futures::StreamExt;
787
788 let counter: Arc<AtomicI32> = Arc::new(AtomicI32::new(0));
789 let pairs: [(Arc<AtomicI32>, i32); 3] = [
790 (Arc::clone(&counter), 1),
791 (Arc::clone(&counter), -1),
792 (Arc::clone(&counter), -1),
793 ];
794 let [p1, p2, p3] = [self.page.clone(), self.page.clone(), self.page.clone()];
795
796 macro_rules! spawn_tracker {
797 ($page:expr, $event:ty, $c:expr, $delta:expr) => {
798 match $page.event_listener::<$event>().await {
799 Ok(mut s) => {
800 let c = $c;
801 let d = $delta;
802 tokio::spawn(async move {
803 while s.next().await.is_some() {
804 c.fetch_add(d, Ordering::Relaxed);
805 }
806 });
807 }
808 Err(e) => warn!("network-idle tracker unavailable: {e}"),
809 }
810 };
811 }
812
813 let [(c1, d1), (c2, d2), (c3, d3)] = pairs;
814 spawn_tracker!(p1, EventRequestWillBeSent, c1, d1);
815 spawn_tracker!(p2, EventLoadingFinished, c2, d2);
816 spawn_tracker!(p3, EventLoadingFailed, c3, d3);
817
818 counter
819 }
820
821 async fn wait_network_idle(counter: &Arc<std::sync::atomic::AtomicI32>) {
822 const IDLE_THRESHOLD: i32 = 2;
823 const SETTLE: Duration = Duration::from_millis(500);
824 loop {
825 if counter.load(Ordering::Relaxed) <= IDLE_THRESHOLD {
826 tokio::time::sleep(SETTLE).await;
827 if counter.load(Ordering::Relaxed) <= IDLE_THRESHOLD {
828 break;
829 }
830 } else {
831 tokio::time::sleep(Duration::from_millis(50)).await;
832 }
833 }
834 }
835
836 ///
837 /// # Errors
838 ///
839 /// within the given timeout.
840 pub async fn wait_for_selector(&self, selector: &str, wait_timeout: Duration) -> Result<()> {
841 let selector_owned = selector.to_string();
842 let poll = async {
843 loop {
844 if self.page.find_element(selector_owned.clone()).await.is_ok() {
845 return Ok(());
846 }
847 tokio::time::sleep(Duration::from_millis(100)).await;
848 }
849 };
850
851 timeout(wait_timeout, poll)
852 .await
853 .map_err(|_| BrowserError::NavigationFailed {
854 url: String::new(),
855 reason: format!("selector '{selector_owned}' not found within {wait_timeout:?}"),
856 })?
857 }
858
    /// Install a resource filter on this page.
    ///
    /// Enables `Fetch` interception and spawns a background task that continues
    /// allowed requests and fails blocked ones with `BlockedByClient`. Any
    /// previously set filter task is cancelled first.
    ///
    /// An **empty** filter is a no-op: the method returns immediately, before
    /// cancelling anything, so a previously installed filter keeps running.
    ///
    /// # Errors
    ///
    /// Returns [`BrowserError::Timeout`] if `Fetch.enable` exceeds
    /// `cdp_timeout`, or [`BrowserError::CdpError`] if enabling `Fetch` or
    /// subscribing to `Fetch.requestPaused` fails.
    pub async fn set_resource_filter(&mut self, filter: ResourceFilter) -> Result<()> {
        use chromiumoxide::cdp::browser_protocol::fetch::{
            ContinueRequestParams, EnableParams, EventRequestPaused, FailRequestParams,
            RequestPattern,
        };
        use chromiumoxide::cdp::browser_protocol::network::ErrorReason;
        use futures::StreamExt as _;

        // Nothing to block: leave the page (and any existing filter) untouched.
        if filter.is_empty() {
            return Ok(());
        }

        // Cancel any previously running filter task.
        if let Some(task) = self.resource_filter_task.take() {
            task.abort();
        }

        // Intercept every URL; auth challenges are not routed through Fetch.
        let pattern = RequestPattern::builder().url_pattern("*").build();
        let params = EnableParams::builder()
            .patterns(vec![pattern])
            .handle_auth_requests(false)
            .build();

        timeout(self.cdp_timeout, self.page.execute::<EnableParams>(params))
            .await
            .map_err(|_| BrowserError::Timeout {
                operation: "Fetch.enable".to_string(),
                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
            })?
            .map_err(|e| BrowserError::CdpError {
                operation: "Fetch.enable".to_string(),
                message: e.to_string(),
            })?;

        // Subscribe to `Fetch.requestPaused` so that every intercepted request
        // is answered and a request that should be allowed
        // is never blocked. Without this handler Chrome holds every intercepted
        // request indefinitely and the page hangs.
        let mut events = self
            .page
            .event_listener::<EventRequestPaused>()
            .await
            .map_err(|e| BrowserError::CdpError {
                operation: "Fetch.requestPaused subscribe".to_string(),
                message: e.to_string(),
            })?;

        let page = self.page.clone();
        debug!("Resource filter active: {:?}", filter);
        let task = tokio::spawn(async move {
            while let Some(event) = events.next().await {
                let request_id = event.request_id.clone();
                if filter.should_block(event.resource_type.as_ref()) {
                    let params = FailRequestParams::new(request_id, ErrorReason::BlockedByClient);
                    // Best-effort: a failed reply is ignored.
                    let _ = page.execute(params).await;
                } else {
                    let _ = page.execute(ContinueRequestParams::new(request_id)).await;
                }
            }
        });

        // Keep the handle so the next call can abort this task.
        self.resource_filter_task = Some(task);
        Ok(())
    }
928
929 /// Return the current page URL (post-navigation, post-redirect).
930 ///
931 /// internally by [`save_cookies`](Self::save_cookies); no extra network
932 /// request is made. Returns an empty string if the URL is not yet set
933 ///
934 /// # Errors
935 ///
936 /// [`BrowserError::Timeout`] if it exceeds `cdp_timeout`.
937 ///
938 /// # Example
939 ///
940 /// ```no_run
941 /// use stygian_browser::{BrowserPool, BrowserConfig};
942 /// use stygian_browser::page::WaitUntil;
943 /// use std::time::Duration;
944 ///
945 /// # async fn run() -> stygian_browser::error::Result<()> {
946 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
947 /// let handle = pool.acquire().await?;
948 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
949 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
950 /// let url = page.url().await?;
951 /// println!("Final URL after redirects: {url}");
952 /// # Ok(())
953 /// # }
954 /// ```
955 pub async fn url(&self) -> Result<String> {
956 timeout(self.cdp_timeout, self.page.url())
957 .await
958 .map_err(|_| BrowserError::Timeout {
959 operation: "page.url".to_string(),
960 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
961 })?
962 .map_err(|e| BrowserError::CdpError {
963 operation: "page.url".to_string(),
964 message: e.to_string(),
965 })
966 .map(Option::unwrap_or_default)
967 }
968
969 /// Return the HTTP status code of the most recent main-frame navigation.
970 ///
971 /// The status is captured from the `Network.responseReceived` CDP event
972 /// wired up inside [`navigate`](Self::navigate), so it reflects the
973 /// *final* response after any server-side redirects.
974 ///
975 /// navigations, when [`navigate`](Self::navigate) has not yet been called,
976 /// or if the network event subscription failed.
977 ///
978 /// # Errors
979 ///
980 ///
981 /// # Example
982 ///
983 /// ```no_run
984 /// use stygian_browser::{BrowserPool, BrowserConfig};
985 /// use stygian_browser::page::WaitUntil;
986 /// use std::time::Duration;
987 ///
988 /// # async fn run() -> stygian_browser::error::Result<()> {
989 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
990 /// let handle = pool.acquire().await?;
991 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
992 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
993 /// if let Some(code) = page.status_code()? {
994 /// println!("HTTP {code}");
995 /// }
996 /// # Ok(())
997 /// # }
998 /// ```
999 pub fn status_code(&self) -> Result<Option<u16>> {
1000 let code = self.last_status_code.load(Ordering::Acquire);
1001 Ok(if code == 0 { None } else { Some(code) })
1002 }
1003
1004 /// Return the page's `<title>` text.
1005 ///
1006 /// # Errors
1007 ///
1008 pub async fn title(&self) -> Result<String> {
1009 timeout(self.cdp_timeout, self.page.get_title())
1010 .await
1011 .map_err(|_| BrowserError::Timeout {
1012 operation: "get_title".to_string(),
1013 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1014 })?
1015 .map_err(|e| BrowserError::ScriptExecutionFailed {
1016 script: "document.title".to_string(),
1017 reason: e.to_string(),
1018 })
1019 .map(Option::unwrap_or_default)
1020 }
1021
1022 /// Return the page's full outer HTML.
1023 ///
1024 /// # Errors
1025 ///
1026 pub async fn content(&self) -> Result<String> {
1027 timeout(self.cdp_timeout, self.page.content())
1028 .await
1029 .map_err(|_| BrowserError::Timeout {
1030 operation: "page.content".to_string(),
1031 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1032 })?
1033 .map_err(|e| BrowserError::ScriptExecutionFailed {
1034 script: "document.documentElement.outerHTML".to_string(),
1035 reason: e.to_string(),
1036 })
1037 }
1038
1039 /// lightweight [`NodeHandle`]s backed by CDP `RemoteObjectId`s.
1040 ///
1041 /// No HTML serialisation occurs — the browser's in-memory DOM is queried
1042 /// directly over the CDP connection, eliminating the `page.content()` +
1043 /// `scraper::Html::parse_document` round-trip.
1044 ///
1045 ///
1046 /// # Errors
1047 ///
1048 /// [`BrowserError::Timeout`] if it exceeds `cdp_timeout`.
1049 ///
1050 /// # Example
1051 ///
1052 /// ```no_run
1053 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
1054 /// use std::time::Duration;
1055 ///
1056 /// # async fn run() -> stygian_browser::error::Result<()> {
1057 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1058 /// let handle = pool.acquire().await?;
1059 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1060 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
1061 /// # let nodes = page.query_selector_all("div[data-ux]").await?;
1062 /// # for node in &nodes {
1063 /// let ux_type = node.attr("data-ux").await?;
1064 /// let text = node.text_content().await?;
1065 /// println!("{ux_type:?}: {text}");
1066 /// # }
1067 /// # Ok(())
1068 /// # }
1069 /// ```
1070 pub async fn query_selector_all(&self, selector: &str) -> Result<Vec<NodeHandle>> {
1071 let elements = timeout(self.cdp_timeout, self.page.find_elements(selector))
1072 .await
1073 .map_err(|_| BrowserError::Timeout {
1074 operation: "PageHandle::query_selector_all".to_string(),
1075 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1076 })?
1077 .map_err(|e| BrowserError::CdpError {
1078 operation: "PageHandle::query_selector_all".to_string(),
1079 message: e.to_string(),
1080 })?;
1081
1082 let selector_arc: Arc<str> = Arc::from(selector);
1083 Ok(elements
1084 .into_iter()
1085 .map(|el| NodeHandle {
1086 element: el,
1087 selector: selector_arc.clone(),
1088 cdp_timeout: self.cdp_timeout,
1089 page: self.page.clone(),
1090 })
1091 .collect())
1092 }
1093
1094 /// Evaluate arbitrary JavaScript and return the result as `T`.
1095 ///
1096 /// # Errors
1097 ///
1098 /// deserialization error.
1099 pub async fn eval<T: serde::de::DeserializeOwned>(&self, script: &str) -> Result<T> {
1100 let script_owned = script.to_string();
1101 timeout(self.cdp_timeout, self.page.evaluate(script))
1102 .await
1103 .map_err(|_| BrowserError::Timeout {
1104 operation: "page.evaluate".to_string(),
1105 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1106 })?
1107 .map_err(|e| BrowserError::ScriptExecutionFailed {
1108 script: script_owned.clone(),
1109 reason: e.to_string(),
1110 })?
1111 .into_value::<T>()
1112 .map_err(|e| BrowserError::ScriptExecutionFailed {
1113 script: script_owned,
1114 reason: e.to_string(),
1115 })
1116 }
1117
1118 ///
1119 /// # Errors
1120 ///
1121 pub async fn save_cookies(
1122 &self,
1123 ) -> Result<Vec<chromiumoxide::cdp::browser_protocol::network::Cookie>> {
1124 use chromiumoxide::cdp::browser_protocol::network::GetCookiesParams;
1125
1126 let url = self
1127 .page
1128 .url()
1129 .await
1130 .map_err(|e| BrowserError::CdpError {
1131 operation: "page.url".to_string(),
1132 message: e.to_string(),
1133 })?
1134 .unwrap_or_default();
1135
1136 timeout(
1137 self.cdp_timeout,
1138 self.page
1139 .execute(GetCookiesParams::builder().urls(vec![url]).build()),
1140 )
1141 .await
1142 .map_err(|_| BrowserError::Timeout {
1143 operation: "Network.getCookies".to_string(),
1144 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1145 })?
1146 .map_err(|e| BrowserError::CdpError {
1147 operation: "Network.getCookies".to_string(),
1148 message: e.to_string(),
1149 })
1150 .map(|r| r.cookies.clone())
1151 }
1152
1153 ///
1154 /// [`SessionSnapshot`][crate::session::SessionSnapshot] and without
1155 /// requiring a direct `chromiumoxide` dependency in calling code.
1156 ///
1157 /// Individual cookie failures are logged as warnings and do not abort the
1158 /// remaining cookies.
1159 ///
1160 /// # Errors
1161 ///
1162 /// call exceeds `cdp_timeout`.
1163 ///
1164 /// # Example
1165 ///
1166 /// ```no_run
1167 /// use stygian_browser::{BrowserPool, BrowserConfig};
1168 /// use stygian_browser::session::SessionCookie;
1169 /// use std::time::Duration;
1170 ///
1171 /// # async fn run() -> stygian_browser::error::Result<()> {
1172 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1173 /// let handle = pool.acquire().await?;
1174 /// let page = handle.browser().expect("valid browser").new_page().await?;
1175 /// let cookies = vec![SessionCookie {
1176 /// name: "session".to_string(),
1177 /// value: "abc123".to_string(),
1178 /// domain: ".example.com".to_string(),
1179 /// path: "/".to_string(),
1180 /// expires: -1.0,
1181 /// http_only: true,
1182 /// secure: true,
1183 /// same_site: "Lax".to_string(),
1184 /// }];
1185 /// page.inject_cookies(&cookies).await?;
1186 /// # Ok(())
1187 /// # }
1188 /// ```
1189 pub async fn inject_cookies(&self, cookies: &[crate::session::SessionCookie]) -> Result<()> {
1190 use chromiumoxide::cdp::browser_protocol::network::SetCookieParams;
1191
1192 for cookie in cookies {
1193 let params = match SetCookieParams::builder()
1194 .name(cookie.name.clone())
1195 .value(cookie.value.clone())
1196 .domain(cookie.domain.clone())
1197 .path(cookie.path.clone())
1198 .http_only(cookie.http_only)
1199 .secure(cookie.secure)
1200 .build()
1201 {
1202 Ok(p) => p,
1203 Err(e) => {
1204 warn!(cookie = %cookie.name, error = %e, "Failed to build cookie params");
1205 continue;
1206 }
1207 };
1208
1209 match timeout(self.cdp_timeout, self.page.execute(params)).await {
1210 Err(_) => {
1211 warn!(
1212 cookie = %cookie.name,
1213 timeout_ms = self.cdp_timeout.as_millis(),
1214 "Timed out injecting cookie"
1215 );
1216 }
1217 Ok(Err(e)) => {
1218 warn!(cookie = %cookie.name, error = %e, "Failed to inject cookie");
1219 }
1220 Ok(Ok(_)) => {}
1221 }
1222 }
1223
1224 debug!(count = cookies.len(), "Cookies injected");
1225 Ok(())
1226 }
1227
1228 /// Capture a screenshot of the current page as PNG bytes.
1229 ///
1230 /// them in-memory.
1231 ///
1232 /// # Errors
1233 ///
1234 /// command fails, or [`BrowserError::Timeout`] if it exceeds
1235 /// `cdp_timeout`.
1236 ///
1237 /// # Example
1238 ///
1239 /// ```no_run
1240 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
1241 /// use std::{time::Duration, fs};
1242 ///
1243 /// # async fn run() -> stygian_browser::error::Result<()> {
1244 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1245 /// let handle = pool.acquire().await?;
1246 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1247 /// let png = page.screenshot().await?;
1248 /// fs::write("screenshot.png", &png).unwrap();
1249 /// # Ok(())
1250 /// # }
1251 /// ```
1252 pub async fn screenshot(&self) -> Result<Vec<u8>> {
1253 use chromiumoxide::page::ScreenshotParams;
1254
1255 let params = ScreenshotParams::builder().full_page(true).build();
1256
1257 timeout(self.cdp_timeout, self.page.screenshot(params))
1258 .await
1259 .map_err(|_| BrowserError::Timeout {
1260 operation: "Page.captureScreenshot".to_string(),
1261 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1262 })?
1263 .map_err(|e| BrowserError::CdpError {
1264 operation: "Page.captureScreenshot".to_string(),
1265 message: e.to_string(),
1266 })
1267 }
1268
1269 /// Borrow the underlying chromiumoxide [`Page`].
1270 pub const fn inner(&self) -> &Page {
1271 &self.page
1272 }
1273
1274 /// Close this page (tab).
1275 ///
1276 pub async fn close(self) -> Result<()> {
1277 timeout(Duration::from_secs(5), self.page.clone().close())
1278 .await
1279 .map_err(|_| BrowserError::Timeout {
1280 operation: "page.close".to_string(),
1281 duration_ms: 5000,
1282 })?
1283 .map_err(|e| BrowserError::CdpError {
1284 operation: "page.close".to_string(),
1285 message: e.to_string(),
1286 })
1287 }
1288}
1289
1290// ─── Stealth diagnostics ──────────────────────────────────────────────────────
1291
1292#[cfg(feature = "stealth")]
1293impl PageHandle {
1294 /// Run all built-in stealth detection checks against the current page.
1295 ///
1296 /// Iterates [`crate::diagnostic::all_checks`], evaluates each check's
1297 /// JavaScript via CDP `Runtime.evaluate`, and returns an aggregate
1298 /// [`crate::diagnostic::DiagnosticReport`].
1299 ///
1300 /// recorded as failing checks and do **not** abort the whole run.
1301 ///
1302 /// # Errors
1303 ///
1304 /// Individual check failures are captured in the report.
1305 ///
1306 /// # Example
1307 ///
1308 /// ```no_run
1309 /// # async fn run() -> stygian_browser::error::Result<()> {
1310 /// use stygian_browser::{BrowserPool, BrowserConfig};
1311 /// use stygian_browser::page::WaitUntil;
1312 /// use std::time::Duration;
1313 ///
1314 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1315 /// let handle = pool.acquire().await?;
1316 /// let browser = handle.browser().expect("valid browser");
1317 /// let mut page = browser.new_page().await?;
1318 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(10)).await?;
1319 ///
1320 /// let report = page.verify_stealth().await?;
1321 /// println!("Stealth: {}/{} checks passed", report.passed_count, report.checks.len());
1322 /// # for failure in report.failures() {
1323 /// eprintln!(" FAIL {}: {}", failure.description, failure.details);
1324 /// # }
1325 /// # Ok(())
1326 /// # }
1327 /// ```
1328 pub async fn verify_stealth(&self) -> Result<crate::diagnostic::DiagnosticReport> {
1329 use crate::diagnostic::{CheckResult, DiagnosticReport, all_checks};
1330
1331 let mut results: Vec<CheckResult> = Vec::new();
1332
1333 for check in all_checks() {
1334 let result = match self.eval::<String>(check.script).await {
1335 Ok(json) => check.parse_output(&json),
1336 Err(e) => {
1337 tracing::warn!(
1338 check = ?check.id,
1339 error = %e,
1340 "stealth check script failed during evaluation"
1341 );
1342 CheckResult {
1343 id: check.id,
1344 description: check.description.to_string(),
1345 passed: false,
1346 details: format!("script error: {e}"),
1347 }
1348 }
1349 };
1350 tracing::debug!(
1351 check = ?result.id,
1352 passed = result.passed,
1353 details = %result.details,
1354 "stealth check result"
1355 );
1356 results.push(result);
1357 }
1358
1359 Ok(DiagnosticReport::new(results))
1360 }
1361
1362 /// Run stealth checks and attach transport diagnostics (JA3/JA4/HTTP3).
1363 ///
1364 pub async fn verify_stealth_with_transport(
1365 &self,
1366 observed: Option<crate::diagnostic::TransportObservations>,
1367 ) -> Result<crate::diagnostic::DiagnosticReport> {
1368 let report = self.verify_stealth().await?;
1369
1370 let user_agent = match self.eval::<String>("navigator.userAgent").await {
1371 Ok(ua) => ua,
1372 Err(e) => {
1373 tracing::warn!(error = %e, "failed to read navigator.userAgent for transport diagnostics");
1374 String::new()
1375 }
1376 };
1377
1378 let transport = crate::diagnostic::TransportDiagnostic::from_user_agent_and_observations(
1379 &user_agent,
1380 observed.as_ref(),
1381 );
1382
1383 Ok(report.with_transport(transport))
1384 }
1385}
1386
1387// ─── extract feature ─────────────────────────────────────────────────────────
1388
1389#[cfg(feature = "extract")]
1390impl PageHandle {
1391 ///
1392 ///
1393 /// All per-node extractions are driven concurrently via
1394 /// [`futures::future::try_join_all`].
1395 ///
1396 /// # Errors
1397 ///
1398 /// fails, or [`BrowserError::ExtractionFailed`] if any field extraction
1399 /// fails.
1400 ///
1401 /// # Example
1402 ///
1403 /// ```ignore
1404 /// use stygian_browser::extract::Extract;
1405 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
1406 /// use std::time::Duration;
1407 ///
1408 /// #[derive(Extract)]
1409 /// struct Link {
1410 /// href: Option<String>,
1411 /// }
1412 ///
1413 /// # async fn run() -> stygian_browser::error::Result<()> {
1414 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1415 /// let handle = pool.acquire().await?;
1416 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1417 /// page.navigate(
1418 /// "https://example.com",
1419 /// WaitUntil::DomContentLoaded,
1420 /// Duration::from_secs(30),
1421 /// ).await?;
1422 /// let links: Vec<Link> = page.extract_all::<Link>("nav li").await?;
1423 /// # Ok(())
1424 /// # }
1425 /// ```
1426 pub async fn extract_all<T>(&self, selector: &str) -> Result<Vec<T>>
1427 where
1428 T: crate::extract::Extractable,
1429 {
1430 use futures::future::try_join_all;
1431
1432 let nodes = self.query_selector_all(selector).await?;
1433 try_join_all(nodes.iter().map(|n| T::extract_from(n)))
1434 .await
1435 .map_err(BrowserError::ExtractionFailed)
1436 }
1437
1438 /// Try each selector in `selectors` in order and return the extracted
1439 /// results from the **first** selector that matches at least one node.
1440 ///
1441 /// This is useful when a page may use different markup across versions or
1442 /// A/B variants — supply the preferred selector first and progressively
1443 /// wider fallbacks afterwards.
1444 ///
1445 /// Returns an empty `Vec` only when *all* selectors match zero nodes
1446 /// (i.e. the element is genuinely absent from the page). A non-empty
1447 /// intermediate selector result that then fails during extraction **will**
1448 /// return an error.
1449 ///
1450 /// # Errors
1451 ///
1452 /// Returns [`BrowserError::CdpError`] if the selector query fails, or
1453 /// [`BrowserError::ExtractionFailed`] if a matched node fails extraction.
1454 ///
1455 /// # Example
1456 ///
1457 /// ```ignore
1458 /// use stygian_browser::extract::Extract;
1459 ///
1460 /// #[derive(Extract)]
1461 /// struct Headline { title: String }
1462 ///
1463 /// # async fn run(page: &stygian_browser::PageHandle) -> stygian_browser::error::Result<()> {
1464 /// // Try modern selector first, fall back to legacy markup.
1465 /// let items = page
1466 /// .extract_all_with_fallback::<Headline>(&["h2.headline", "h2.title", "h2"])
1467 /// .await?;
1468 /// # Ok(())
1469 /// # }
1470 /// ```
1471 pub async fn extract_all_with_fallback<T>(&self, selectors: &[&str]) -> Result<Vec<T>>
1472 where
1473 T: crate::extract::Extractable,
1474 {
1475 use futures::future::try_join_all;
1476
1477 for &selector in selectors {
1478 let nodes = self.query_selector_all(selector).await?;
1479 if nodes.is_empty() {
1480 continue;
1481 }
1482 return try_join_all(nodes.iter().map(|n| T::extract_from(n)))
1483 .await
1484 .map_err(BrowserError::ExtractionFailed);
1485 }
1486
1487 Ok(vec![])
1488 }
1489
1490 /// Extract from every node matching `selector`, **skipping** nodes where
1491 /// a required field is absent (i.e. [`ExtractionError::Missing`]).
1492 ///
1493 /// Unlike [`extract_all`], this method is lenient about structural
1494 /// mismatches: nodes that fail with [`ExtractionError::Missing`] are
1495 /// silently dropped from the result set. All other extraction errors
1496 /// (CDP failures, stale nodes, nested errors) still propagate as hard
1497 /// failures.
1498 ///
1499 /// This is useful when scraping heterogeneous lists where some items
1500 /// lack an optional field that your struct treats as required.
1501 ///
1502 /// [`extract_all`]: Self::extract_all
1503 /// [`ExtractionError::Missing`]: crate::extract::ExtractionError::Missing
1504 ///
1505 /// # Errors
1506 ///
1507 /// Returns [`BrowserError::CdpError`] if the selector query fails, or
1508 /// [`BrowserError::ExtractionFailed`] for non-`Missing` extraction errors.
1509 ///
1510 /// # Example
1511 ///
1512 /// ```ignore
1513 /// use stygian_browser::extract::Extract;
1514 ///
1515 /// #[derive(Extract)]
1516 /// struct Price { amount: String }
1517 ///
1518 /// # async fn run(page: &stygian_browser::PageHandle) -> stygian_browser::error::Result<()> {
1519 /// // Products without a price tag are silently skipped.
1520 /// let prices = page.extract_resilient::<Price>(".product").await?;
1521 /// # Ok(())
1522 /// # }
1523 /// ```
1524 pub async fn extract_resilient<T>(&self, selector: &str) -> Result<Vec<T>>
1525 where
1526 T: crate::extract::Extractable,
1527 {
1528 use crate::extract::ExtractionError;
1529
1530 let nodes = self.query_selector_all(selector).await?;
1531 let mut results = Vec::with_capacity(nodes.len());
1532
1533 for node in &nodes {
1534 match T::extract_from(node).await {
1535 Ok(item) => results.push(item),
1536 Err(ExtractionError::Missing { .. }) => {
1537 tracing::debug!(
1538 selector,
1539 "extract_resilient: skipping node with missing required field"
1540 );
1541 }
1542 Err(e) => return Err(BrowserError::ExtractionFailed(e)),
1543 }
1544 }
1545
1546 Ok(results)
1547 }
1548}
1549
1550// ─── similarity feature ──────────────────────────────────────────────────────
1551
1552#[cfg(feature = "similarity")]
1553impl NodeHandle {
1554 /// node.
1555 ///
1556 /// Issues a single `Runtime.callFunctionOn` JS eval that extracts the tag,
1557 /// class list, attribute names, and body-depth in one round-trip.
1558 ///
1559 /// # Errors
1560 ///
1561 /// invalidated, or [`BrowserError::ScriptExecutionFailed`] if the script
1562 /// produces unexpected output.
1563 pub async fn fingerprint(&self) -> Result<crate::similarity::ElementFingerprint> {
1564 const JS: &str = r"function() {
1565 var el = this;
1566 var tag = el.tagName.toLowerCase();
1567 var classes = Array.prototype.slice.call(el.classList).sort();
1568 var attrNames = Array.prototype.slice.call(el.attributes)
1569 .map(function(a) { return a.name; })
1570 .filter(function(n) { return n !== 'class' && n !== 'id'; })
1571 .sort();
1572 var depth = 0;
1573 var n = el.parentElement;
1574 while (n && n.tagName.toLowerCase() !== 'body') { depth++; n = n.parentElement; }
1575 return JSON.stringify({ tag: tag, classes: classes, attrNames: attrNames, depth: depth });
1576}";
1577
1578 let returns = tokio::time::timeout(self.cdp_timeout, self.element.call_js_fn(JS, true))
1579 .await
1580 .map_err(|_| BrowserError::Timeout {
1581 operation: "NodeHandle::fingerprint".to_string(),
1582 duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
1583 })?
1584 .map_err(|e| self.cdp_err_or_stale(&e, "fingerprint"))?;
1585
1586 let json_str = returns
1587 .result
1588 .value
1589 .as_ref()
1590 .and_then(|v| v.as_str())
1591 .ok_or_else(|| BrowserError::ScriptExecutionFailed {
1592 script: "NodeHandle::fingerprint".to_string(),
1593 reason: "CDP returned no string value from fingerprint script".to_string(),
1594 })?;
1595
1596 serde_json::from_str::<crate::similarity::ElementFingerprint>(json_str).map_err(|e| {
1597 BrowserError::ScriptExecutionFailed {
1598 script: "NodeHandle::fingerprint".to_string(),
1599 reason: format!("failed to deserialise fingerprint JSON: {e}"),
1600 }
1601 })
1602 }
1603}
1604
1605#[cfg(feature = "similarity")]
1606impl PageHandle {
1607 /// `reference`, scored by [`crate::similarity::SimilarityConfig`].
1608 ///
1609 /// [`NodeHandle::fingerprint`]), then fingerprints every candidate returned
1610 /// [`crate::similarity::jaccard_weighted`] score exceeds
1611 /// `config.threshold`. Results are ordered by score descending.
1612 ///
1613 /// # Example
1614 ///
1615 /// ```no_run
1616 /// use stygian_browser::{BrowserPool, BrowserConfig, WaitUntil};
1617 /// use stygian_browser::similarity::SimilarityConfig;
1618 /// use std::time::Duration;
1619 ///
1620 /// # async fn run() -> stygian_browser::error::Result<()> {
1621 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1622 /// let handle = pool.acquire().await?;
1623 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1624 /// page.navigate("https://example.com", WaitUntil::DomContentLoaded, Duration::from_secs(30)).await?;
1625 ///
1626 /// # let nodes = page.query_selector_all("h1").await?;
1627 /// # let reference = nodes.into_iter().next().ok_or(stygian_browser::error::BrowserError::StaleNode { selector: "h1".to_string() })?;
1628 /// let similar = page.find_similar(&reference, SimilarityConfig::default()).await?;
1629 /// # for m in &similar {
1630 /// println!("score={:.2}", m.score);
1631 /// # }
1632 /// # Ok(())
1633 /// # }
1634 /// ```
1635 ///
1636 /// # Errors
1637 ///
1638 /// [`BrowserError::ScriptExecutionFailed`] if a scoring script fails.
1639 pub async fn find_similar(
1640 &self,
1641 reference: &NodeHandle,
1642 config: crate::similarity::SimilarityConfig,
1643 ) -> Result<Vec<crate::similarity::SimilarMatch>> {
1644 use crate::similarity::{SimilarMatch, jaccard_weighted};
1645
1646 let ref_fp = reference.fingerprint().await?;
1647 let candidates = self.query_selector_all("*").await?;
1648
1649 let mut matches: Vec<SimilarMatch> = Vec::new();
1650 for node in candidates {
1651 if let Ok(cand_fp) = node.fingerprint().await {
1652 let score = jaccard_weighted(&ref_fp, &cand_fp);
1653 if score >= config.threshold {
1654 matches.push(SimilarMatch { node, score });
1655 }
1656 }
1657 // Stale / detached nodes are silently skipped.
1658 }
1659
1660 matches.sort_by(|a, b| {
1661 b.score
1662 .partial_cmp(&a.score)
1663 .unwrap_or(std::cmp::Ordering::Equal)
1664 });
1665
1666 if config.max_results > 0 {
1667 matches.truncate(config.max_results);
1668 }
1669
1670 Ok(matches)
1671 }
1672}
1673
1674impl Drop for PageHandle {
1675 fn drop(&mut self) {
1676 warn!("PageHandle dropped without explicit close(); spawning cleanup task");
1677 // chromiumoxide Page does not implement close on Drop, so we spawn
1678 // swap it out. We clone the Page handle (it's Arc-backed internally).
1679 let page = self.page.clone();
1680 tokio::spawn(async move {
1681 let _ = page.close().await;
1682 });
1683 }
1684}
1685
1686// ─── Session warmup & refresh ─────────────────────────────────────────────────
1687
1688/// Simplified, JSON-serializable wait strategy used in [`WarmupOptions`] and
1689/// [`RefreshOptions`].
1690///
1691/// This is a serialization-friendly analogue of [`WaitUntil`]. Use
1692/// [`WarmupWait::into_wait_until`] to convert before calling
1693/// [`PageHandle::navigate`].
1694#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
1695#[serde(rename_all = "snake_case")]
1696pub enum WarmupWait {
1697 /// Wait until the HTML is fully parsed (`DOMContentLoaded`). This is the
1698 /// default and works for most pages.
1699 #[default]
1700 DomContentLoaded,
1701 /// Wait until there are no more than two in-flight network requests for at
1702 /// least 500 ms after navigation.
1703 NetworkIdle,
1704}
1705
1706impl WarmupWait {
1707 /// Convert into the lower-level [`WaitUntil`] enum.
1708 #[must_use]
1709 pub const fn into_wait_until(self) -> WaitUntil {
1710 match self {
1711 Self::DomContentLoaded => WaitUntil::DomContentLoaded,
1712 Self::NetworkIdle => WaitUntil::NetworkIdle,
1713 }
1714 }
1715}
1716
1717/// Options for [`PageHandle::warmup`].
1718///
1719/// # Example
1720///
1721/// ```
1722/// use stygian_browser::page::{WarmupOptions, WarmupWait};
1723///
1724/// let opts = WarmupOptions {
1725/// url: "https://example.com".to_string(),
1726/// wait: WarmupWait::DomContentLoaded,
1727/// timeout_ms: 30_000,
1728/// stabilize_ms: 500,
1729/// };
1730/// assert_eq!(opts.timeout_ms, 30_000);
1731/// ```
1732#[derive(Debug, Clone, Serialize, Deserialize)]
1733pub struct WarmupOptions {
1734 /// The URL to navigate to during warmup.
1735 pub url: String,
1736 /// Wait strategy applied after the navigation commit (default:
1737 /// `DomContentLoaded`).
1738 #[serde(default)]
1739 pub wait: WarmupWait,
1740 /// Navigation timeout in milliseconds. Default: `30 000`.
1741 #[serde(default = "WarmupOptions::default_timeout_ms")]
1742 pub timeout_ms: u64,
1743 /// Additional pause after navigation to let dynamic resources (XHR,
1744 /// lazy-loaded images) settle, in milliseconds. `0` disables the
1745 /// stabilization step (default).
1746 #[serde(default)]
1747 pub stabilize_ms: u64,
1748}
1749
1750impl WarmupOptions {
1751 /// Returns the default navigation timeout (30 000 ms).
1752 #[must_use]
1753 pub const fn default_timeout_ms() -> u64 {
1754 30_000
1755 }
1756}
1757
1758impl Default for WarmupOptions {
1759 fn default() -> Self {
1760 Self {
1761 url: String::new(),
1762 wait: WarmupWait::DomContentLoaded,
1763 timeout_ms: Self::default_timeout_ms(),
1764 stabilize_ms: 0,
1765 }
1766 }
1767}
1768
1769/// Diagnostic report produced by [`PageHandle::warmup`].
1770///
1771/// # Example
1772///
1773/// ```
1774/// use stygian_browser::page::WarmupReport;
1775/// let report = WarmupReport {
1776/// url: "https://example.com".to_string(),
1777/// elapsed_ms: 250,
1778/// status_code: Some(200),
1779/// title: "Example Domain".to_string(),
1780/// stabilized: false,
1781/// };
1782/// assert_eq!(report.status_code, Some(200));
1783/// ```
1784#[derive(Debug, Clone, Serialize, Deserialize)]
1785pub struct WarmupReport {
1786 /// The URL that was warmed.
1787 pub url: String,
1788 /// Elapsed wall-time in milliseconds.
1789 pub elapsed_ms: u64,
1790 /// HTTP status code of the warmup navigation, if captured by the
1791 /// `Network.responseReceived` listener.
1792 pub status_code: Option<u16>,
1793 /// Page title after warmup navigation.
1794 pub title: String,
1795 /// Whether a stabilization pause (`stabilize_ms > 0`) was applied after
1796 /// navigation.
1797 pub stabilized: bool,
1798}
1799
1800/// Options for [`PageHandle::refresh`].
1801///
1802/// # Example
1803///
1804/// ```
1805/// use stygian_browser::page::{RefreshOptions, WarmupWait};
1806///
1807/// let opts = RefreshOptions {
1808/// wait: WarmupWait::DomContentLoaded,
1809/// timeout_ms: 15_000,
1810/// reset_connection: true,
1811/// };
1812/// assert!(opts.reset_connection);
1813/// ```
1814#[derive(Debug, Clone, Serialize, Deserialize)]
1815pub struct RefreshOptions {
1816 /// Wait strategy applied after the reload (default: `DomContentLoaded`).
1817 #[serde(default)]
1818 pub wait: WarmupWait,
1819 /// Reload timeout in milliseconds. Default: `30 000`.
1820 #[serde(default = "RefreshOptions::default_timeout_ms")]
1821 pub timeout_ms: u64,
1822 /// When `true`, re-navigates to the current URL rather than issuing a
1823 /// browser-level reload. This signals to the calling code that a new TCP
1824 /// connection is desired while cookies and storage are retained in the
1825 /// browser process. Default: `false`.
1826 #[serde(default)]
1827 pub reset_connection: bool,
1828}
1829
1830impl RefreshOptions {
1831 /// Returns the default reload timeout (30 000 ms).
1832 #[must_use]
1833 pub const fn default_timeout_ms() -> u64 {
1834 30_000
1835 }
1836}
1837
1838impl Default for RefreshOptions {
1839 fn default() -> Self {
1840 Self {
1841 wait: WarmupWait::DomContentLoaded,
1842 timeout_ms: Self::default_timeout_ms(),
1843 reset_connection: false,
1844 }
1845 }
1846}
1847
1848/// Diagnostic report produced by [`PageHandle::refresh`].
1849///
1850/// # Example
1851///
1852/// ```
1853/// use stygian_browser::page::RefreshReport;
1854/// let report = RefreshReport {
1855/// url: "https://example.com".to_string(),
1856/// elapsed_ms: 180,
1857/// status_code: Some(200),
1858/// };
1859/// assert_eq!(report.elapsed_ms, 180);
1860/// ```
1861#[derive(Debug, Clone, Serialize, Deserialize)]
1862pub struct RefreshReport {
1863 /// URL of the page after the refresh navigation.
1864 pub url: String,
1865 /// Elapsed wall-time in milliseconds.
1866 pub elapsed_ms: u64,
1867 /// HTTP status code of the refresh navigation, if captured.
1868 pub status_code: Option<u16>,
1869}
1870
1871// ─── PageHandle warmup / refresh ──────────────────────────────────────────────
1872
1873impl PageHandle {
1874 /// Warm up a browser session by navigating to `options.url` and
1875 /// optionally waiting for dynamic resources to settle.
1876 ///
1877 /// Warmup is **idempotent**: calling it repeatedly re-navigates and
1878 /// re-warms the same session without adverse side effects.
1879 ///
1880 /// # Errors
1881 ///
1882 /// Returns [`BrowserError::NavigationFailed`] if the navigation times out
1883 /// or the underlying CDP call fails.
1884 ///
1885 /// # Example
1886 ///
1887 /// ```no_run
1888 /// # async fn run() -> stygian_browser::error::Result<()> {
1889 /// use stygian_browser::{BrowserPool, BrowserConfig};
1890 /// use stygian_browser::page::{WarmupOptions, WarmupWait};
1891 ///
1892 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1893 /// let handle = pool.acquire().await?;
1894 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1895 ///
1896 /// let report = page.warmup(WarmupOptions {
1897 /// url: "https://example.com".to_string(),
1898 /// wait: WarmupWait::DomContentLoaded,
1899 /// timeout_ms: 30_000,
1900 /// stabilize_ms: 500,
1901 /// }).await?;
1902 /// println!("warmed in {}ms: {}", report.elapsed_ms, report.title);
1903 /// handle.release().await;
1904 /// # Ok(())
1905 /// # }
1906 /// ```
1907 pub async fn warmup(&mut self, options: WarmupOptions) -> Result<WarmupReport> {
1908 let start = std::time::Instant::now();
1909 let nav_timeout = Duration::from_millis(options.timeout_ms);
1910 self.navigate(
1911 &options.url,
1912 options.wait.clone().into_wait_until(),
1913 nav_timeout,
1914 )
1915 .await?;
1916 let status_code = self.status_code()?;
1917 let title = self.title().await.unwrap_or_default();
1918 let stabilized = options.stabilize_ms > 0;
1919 if stabilized {
1920 tokio::time::sleep(Duration::from_millis(options.stabilize_ms)).await;
1921 }
1922 let elapsed_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
1923 Ok(WarmupReport {
1924 url: options.url,
1925 elapsed_ms,
1926 status_code,
1927 title,
1928 stabilized,
1929 })
1930 }
1931
1932 /// Refresh the current page, retaining all in-browser session state
1933 /// (cookies, `localStorage`, `sessionStorage`).
1934 ///
1935 /// When `options.reset_connection` is `false` (default) a standard
1936 /// CDP reload is issued. When `true`, the current URL is re-navigated,
1937 /// which expresses the caller's intent to force a new underlying TCP/TLS
1938 /// connection while keeping all browser-side state intact.
1939 ///
1940 /// Refresh is **idempotent**: repeated calls simply reload the page again.
1941 ///
1942 /// # Errors
1943 ///
1944 /// Returns [`BrowserError::NavigationFailed`] if the current URL cannot be
1945 /// determined or the reload times out.
1946 ///
1947 /// # Example
1948 ///
1949 /// ```no_run
1950 /// # async fn run() -> stygian_browser::error::Result<()> {
1951 /// use stygian_browser::{BrowserPool, BrowserConfig};
1952 /// use stygian_browser::page::{RefreshOptions, WaitUntil};
1953 ///
1954 /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
1955 /// let handle = pool.acquire().await?;
1956 /// let mut page = handle.browser().expect("valid browser").new_page().await?;
1957 /// page.navigate(
1958 /// "https://example.com",
1959 /// WaitUntil::DomContentLoaded,
1960 /// std::time::Duration::from_secs(30),
1961 /// ).await?;
1962 ///
1963 /// let report = page.refresh(RefreshOptions::default()).await?;
1964 /// println!("refreshed in {}ms", report.elapsed_ms);
1965 /// handle.release().await;
1966 /// # Ok(())
1967 /// # }
1968 /// ```
1969 pub async fn refresh(&mut self, options: RefreshOptions) -> Result<RefreshReport> {
1970 let start = std::time::Instant::now();
1971 let nav_timeout = Duration::from_millis(options.timeout_ms);
1972 let wait = options.wait.clone().into_wait_until();
1973 // Resolve the current URL before any navigation changes it.
1974 let current_url = self.url().await?;
1975 if current_url.is_empty() || current_url == "about:blank" {
1976 return Err(BrowserError::NavigationFailed {
1977 url: current_url,
1978 reason: "page has not been navigated yet; call warmup() or navigate() first"
1979 .to_string(),
1980 });
1981 }
1982 // Both code paths navigate to the same URL. `reset_connection: true`
1983 // expresses the *intent* to use a new TCP connection; the browser is free
1984 // to reuse or create a new connection as its connection pool dictates.
1985 self.navigate(¤t_url, wait, nav_timeout).await?;
1986 let status_code = self.status_code()?;
1987 let url = self.url().await?;
1988 let elapsed_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
1989 Ok(RefreshReport {
1990 url,
1991 elapsed_ms,
1992 status_code,
1993 })
1994 }
1995}
1996
1997// ─── Tests ────────────────────────────────────────────────────────────────────
1998
1999#[cfg(test)]
2000mod tests {
2001 use super::*;
2002
/// `ResourceFilter::block_media()` blocks `Image`, `Font`, `Stylesheet` and
/// `Media`, while `Script` and `XHR` are let through.
#[test]
fn resource_filter_block_media_blocks_image() {
    let filter = ResourceFilter::block_media();
    for blocked in ["Image", "Font", "Stylesheet", "Media"] {
        assert!(filter.should_block(blocked), "{blocked} should be blocked");
    }
    for allowed in ["Script", "XHR"] {
        assert!(!filter.should_block(allowed), "{allowed} should not be blocked");
    }
}
2013
/// Resource-type matching ignores ASCII case: both the lowercase and the
/// uppercase spelling of `Image` are blocked, and `Stylesheet` is not.
#[test]
fn resource_filter_case_insensitive() {
    let filter = ResourceFilter::block_images_and_fonts();
    for spelling in ["image", "IMAGE"] {
        assert!(filter.should_block(spelling), "{spelling} should be blocked");
    }
    assert!(!filter.should_block("Stylesheet"));
}
2021
/// Chained `block` calls accumulate: after blocking `Image` and then `Font`
/// both are blocked, while `Stylesheet` stays allowed.
#[test]
fn resource_filter_builder_chain() {
    let images_only = ResourceFilter::default().block(ResourceType::Image);
    let filter = images_only.block(ResourceType::Font);

    let image_blocked = filter.should_block("Image");
    let font_blocked = filter.should_block("Font");
    let stylesheet_blocked = filter.should_block("Stylesheet");

    assert!(image_blocked);
    assert!(font_blocked);
    assert!(!stylesheet_blocked);
}
2031
/// Blocking the same resource type twice leaves a single entry in `blocked`.
#[test]
fn resource_filter_dedup_block() {
    let blocked_once = ResourceFilter::default().block(ResourceType::Image);
    // Second call repeats the same type on purpose.
    let filter = blocked_once.block(ResourceType::Image);
    assert_eq!(filter.blocked.len(), 1);
}
2039
/// A default filter reports itself as empty; the `block_media` preset does not.
#[test]
fn resource_filter_is_empty_when_default() {
    let untouched = ResourceFilter::default();
    assert!(untouched.is_empty());

    let media_preset = ResourceFilter::block_media();
    assert!(!media_preset.is_empty());
}
2045
/// `WaitUntil::Selector` keeps the selector string it was constructed with.
#[test]
fn wait_until_selector_stores_string() {
    let wait = WaitUntil::Selector(String::from("#foo"));
    let holds_selector = matches!(&wait, WaitUntil::Selector(s) if s == "#foo");
    assert!(holds_selector);
}
2051
/// Every `ResourceType` variant maps to the CDP string of the same name.
#[test]
fn resource_type_cdp_str() {
    let expectations = [
        (ResourceType::Image, "Image"),
        (ResourceType::Font, "Font"),
        (ResourceType::Stylesheet, "Stylesheet"),
        (ResourceType::Media, "Media"),
    ];
    for (resource_type, cdp_name) in &expectations {
        assert_eq!(resource_type.as_cdp_str(), *cdp_name);
    }
}
2059
/// Compile-time check that `PageHandle` is both `Send` and `Sync`; the test
/// body does nothing at runtime.
#[test]
fn page_handle_is_send_sync() {
    fn assert_send_and_sync<T: Send + Sync>() {}
    assert_send_and_sync::<PageHandle>();
}
2067
/// Checks the pattern used to classify `ExtractionError` values as
/// skippable: a `Missing` value matches it, while a `Nested` value does not,
/// even when it wraps a `Missing` source.
#[cfg(feature = "extract")]
#[test]
fn extraction_error_missing_is_skippable() {
    use crate::extract::ExtractionError;

    // The "skip" classification under test: only the `Missing` variant qualifies.
    let is_skip = |err: &ExtractionError| matches!(err, ExtractionError::Missing { .. });

    let missing = ExtractionError::Missing {
        field: "title",
        selector: "h1",
    };
    assert!(
        is_skip(&missing),
        "ExtractionError::Missing should be the skip variant"
    );

    // A non-`Missing` variant must NOT match the skip pattern.
    let inner = ExtractionError::Missing {
        field: "href",
        selector: "a",
    };
    let nested = ExtractionError::Nested {
        field: "link",
        source: Box::new(inner),
    };
    assert!(
        !is_skip(&nested),
        "ExtractionError::Nested must not match Missing"
    );
}
2097
/// A stored status value of `0` maps to `None`.
///
/// The mapping from a raw `u16` to `Option<u16>` is pure logic, so it is
/// exercised here without a live browser.
#[test]
fn status_code_sentinel_zero_maps_to_none() {
    use std::sync::atomic::{AtomicU16, Ordering};

    let raw = AtomicU16::new(0).load(Ordering::Acquire);
    let mapped = match raw {
        0 => None,
        other => Some(other),
    };
    assert_eq!(mapped, None::<u16>);
}
2106
/// Any non-zero stored status value maps to `Some` of that same value.
#[test]
fn status_code_non_zero_maps_to_some() {
    use std::sync::atomic::{AtomicU16, Ordering};

    [200u16, 301, 404, 503].iter().copied().for_each(|expected| {
        let raw = AtomicU16::new(expected).load(Ordering::Acquire);
        let mapped = match raw {
            0 => None,
            other => Some(other),
        };
        assert_eq!(mapped, Some(expected));
    });
}
2116
2117 // ── NodeHandle pure-logic tests ───────────────────────────────────────────
2118
/// Exercises the `chunks_exact(2)` pairing logic directly, without a live
/// browser: a flat `[name, value, name, value, …]` list must become a map
/// with one entry per pair.
#[test]
fn attr_map_chunking_pairs_correctly() {
    let flat: Vec<String> = ["id", "main", "data-ux", "Section", "class", "container"]
        .iter()
        .map(|s| s.to_string())
        .collect();

    // `chunks_exact(2)` only yields slices of length two, so indexing is safe.
    let map: std::collections::HashMap<String, String> = flat
        .chunks_exact(2)
        .map(|pair| (pair[0].clone(), pair[1].clone()))
        .collect();

    let expected = [("id", "main"), ("data-ux", "Section"), ("class", "container")];
    for (name, value) in &expected {
        assert_eq!(map.get(*name).map(String::as_str), Some(*value));
    }
    assert_eq!(map.len(), 3);
}
2142
/// An odd-length flat list is handled gracefully — the trailing element,
/// which has no value to pair with, is silently ignored.
#[test]
fn attr_map_chunking_ignores_odd_trailing() {
    let flat = vec!["orphan".to_string()]; // name without a value

    let mut map = std::collections::HashMap::<String, String>::new();
    map.extend(
        flat.chunks_exact(2)
            .map(|pair| (pair[0].clone(), pair[1].clone())),
    );

    assert!(map.is_empty());
}
2155
/// An empty flat list produces an empty map.
#[test]
fn attr_map_chunking_empty_input() {
    let flat: Vec<String> = Vec::new();

    let mut map: std::collections::HashMap<String, String> = std::collections::HashMap::new();
    for pair in flat.chunks_exact(2) {
        if let [name, value] = pair {
            map.insert(name.clone(), value.clone());
        }
    }

    assert!(map.is_empty());
}
2172
/// A JSON array of tag names parses into a `Vec<String>` with the same
/// elements in the same order.
#[test]
fn ancestors_json_parse_round_trip() -> std::result::Result<(), serde_json::Error> {
    let parsed = serde_json::from_str::<Vec<String>>(r#"["p","article","body","html"]"#)?;
    assert_eq!(parsed, ["p", "article", "body", "html"]);
    Ok(())
}
2180
/// An empty JSON array parses into an empty `Vec<String>`.
#[test]
fn ancestors_json_parse_empty() -> std::result::Result<(), serde_json::Error> {
    serde_json::from_str::<Vec<String>>("[]").map(|parsed| assert!(parsed.is_empty()))
}
2188
/// A `StaleNode` error whose selector carries a traversal suffix (such as
/// `"div::parent"`) must surface that suffix in its `Display` output so
/// callers can locate the failed traversal in logs.
#[test]
fn traversal_selector_suffix_in_stale_error() {
    let selector = "div::parent";
    let msg = crate::error::BrowserError::StaleNode {
        selector: selector.to_string(),
    }
    .to_string();
    assert!(
        msg.contains(selector),
        "StaleNode display must include the full selector; got: {msg}"
    );
}
2202
/// The `::next` traversal suffix appears in the `StaleNode` display text.
#[test]
fn traversal_next_suffix_in_stale_error() {
    let selector = "li.price::next";
    let rendered = crate::error::BrowserError::StaleNode {
        selector: selector.to_string(),
    }
    .to_string();
    assert!(rendered.contains(selector));
}
2210
/// The `::prev` traversal suffix appears in the `StaleNode` display text.
#[test]
fn traversal_prev_suffix_in_stale_error() {
    let selector = "td.label::prev";
    let rendered = crate::error::BrowserError::StaleNode {
        selector: selector.to_string(),
    }
    .to_string();
    assert!(rendered.contains(selector));
}
2218
2219 // ── Warmup / Refresh type tests ───────────────────────────────────────────
2220
/// Default `WarmupOptions`: wait for `DomContentLoaded`, use the default
/// timeout, and apply no stabilization delay.
#[test]
fn warmup_options_defaults() {
    let WarmupOptions {
        wait,
        timeout_ms,
        stabilize_ms,
        ..
    } = WarmupOptions::default();

    assert_eq!(wait, WarmupWait::DomContentLoaded);
    assert_eq!(timeout_ms, WarmupOptions::default_timeout_ms());
    assert_eq!(stabilize_ms, 0);
}
2228
/// `WarmupOptions` survives a JSON serialize → deserialize round trip with
/// every field intact.
#[test]
fn warmup_options_serialize_round_trip() -> std::result::Result<(), Box<dyn std::error::Error>> {
    let original = WarmupOptions {
        url: "https://example.com".to_string(),
        wait: WarmupWait::NetworkIdle,
        timeout_ms: 15_000,
        stabilize_ms: 250,
    };

    let encoded = serde_json::to_string(&original)?;
    let WarmupOptions {
        url,
        wait,
        timeout_ms,
        stabilize_ms,
    } = serde_json::from_str::<WarmupOptions>(&encoded)?;

    assert_eq!(url, "https://example.com");
    assert_eq!(wait, WarmupWait::NetworkIdle);
    assert_eq!(timeout_ms, 15_000);
    assert_eq!(stabilize_ms, 250);
    Ok(())
}
2246
/// The default `WarmupWait` is `DomContentLoaded`.
#[test]
fn warmup_wait_default_is_dom_content_loaded() {
    let default_wait = WarmupWait::default();
    assert_eq!(default_wait, WarmupWait::DomContentLoaded);
}
2251
/// Each `WarmupWait` variant converts to the `WaitUntil` variant of the same
/// name.
#[test]
fn warmup_wait_into_wait_until_variants() {
    let from_dom = WarmupWait::DomContentLoaded.into_wait_until();
    assert!(matches!(from_dom, WaitUntil::DomContentLoaded));

    let from_idle = WarmupWait::NetworkIdle.into_wait_until();
    assert!(matches!(from_idle, WaitUntil::NetworkIdle));
}
2263
/// Default `RefreshOptions`: wait for `DomContentLoaded`, use the default
/// timeout, and do not request a connection reset.
#[test]
fn refresh_options_defaults() {
    let RefreshOptions {
        wait,
        timeout_ms,
        reset_connection,
        ..
    } = RefreshOptions::default();

    assert_eq!(wait, WarmupWait::DomContentLoaded);
    assert_eq!(timeout_ms, RefreshOptions::default_timeout_ms());
    assert!(!reset_connection);
}
2271
/// `RefreshOptions` survives a JSON serialize → deserialize round trip with
/// every field intact.
#[test]
fn refresh_options_serialize_round_trip() -> std::result::Result<(), Box<dyn std::error::Error>> {
    let original = RefreshOptions {
        wait: WarmupWait::NetworkIdle,
        timeout_ms: 10_000,
        reset_connection: true,
    };

    let encoded = serde_json::to_string(&original)?;
    let RefreshOptions {
        wait,
        timeout_ms,
        reset_connection,
    } = serde_json::from_str::<RefreshOptions>(&encoded)?;

    assert_eq!(wait, WarmupWait::NetworkIdle);
    assert_eq!(timeout_ms, 10_000);
    assert!(reset_connection);
    Ok(())
}
2287
/// `WarmupReport` survives a JSON serialize → deserialize round trip with
/// every field intact.
#[test]
fn warmup_report_serialize_round_trip() -> std::result::Result<(), Box<dyn std::error::Error>> {
    let original = WarmupReport {
        url: "https://example.com".to_string(),
        elapsed_ms: 320,
        status_code: Some(200),
        title: "Example Domain".to_string(),
        stabilized: true,
    };

    let encoded = serde_json::to_string(&original)?;
    let WarmupReport {
        url,
        elapsed_ms,
        status_code,
        title,
        stabilized,
    } = serde_json::from_str::<WarmupReport>(&encoded)?;

    assert_eq!(url, "https://example.com");
    assert_eq!(elapsed_ms, 320);
    assert_eq!(status_code, Some(200));
    assert_eq!(title, "Example Domain");
    assert!(stabilized);
    Ok(())
}
2306
/// `RefreshReport` survives a JSON serialize → deserialize round trip with
/// every field intact.
#[test]
fn refresh_report_serialize_round_trip() -> std::result::Result<(), Box<dyn std::error::Error>> {
    let original = RefreshReport {
        url: "https://example.com/".to_string(),
        elapsed_ms: 180,
        status_code: Some(304),
    };

    let encoded = serde_json::to_string(&original)?;
    let RefreshReport {
        url,
        elapsed_ms,
        status_code,
    } = serde_json::from_str::<RefreshReport>(&encoded)?;

    assert_eq!(url, "https://example.com/");
    assert_eq!(elapsed_ms, 180);
    assert_eq!(status_code, Some(304));
    Ok(())
}
2322
/// JSON that omits `stabilize_ms` still deserializes, with the field set to 0.
#[test]
fn warmup_options_missing_stabilize_ms_defaults_to_zero()
-> std::result::Result<(), Box<dyn std::error::Error>> {
    // stabilize_ms has `#[serde(default)]`, so leaving it out of the payload
    // must yield 0 rather than a deserialization error.
    let without_stabilize = r#"{"url":"https://example.com","timeout_ms":30000}"#;
    let WarmupOptions { stabilize_ms, .. } =
        serde_json::from_str::<WarmupOptions>(without_stabilize)?;
    assert_eq!(stabilize_ms, 0);
    Ok(())
}
2333
2334 // ── Integration tests (require live Chrome — skipped in CI) ──────────────
2335
/// Warm up a page, then confirm the same page still answers content queries.
///
/// Needs a live Chrome instance, so it is `#[ignore]`d by default.
#[test]
#[ignore = "requires live Chrome"]
#[allow(clippy::expect_used)]
fn integration_warmup_then_extraction() {
    let runtime = tokio::runtime::Runtime::new().expect("tokio runtime");
    runtime.block_on(async {
        use crate::{BrowserConfig, BrowserPool};

        let config = BrowserConfig::default();
        let pool = BrowserPool::new(config).await.expect("pool");
        let lease = pool.acquire().await.expect("handle");
        let mut page = lease
            .browser()
            .expect("browser")
            .new_page()
            .await
            .expect("page");

        let options = WarmupOptions {
            url: "https://example.com".to_string(),
            wait: WarmupWait::DomContentLoaded,
            timeout_ms: 30_000,
            stabilize_ms: 0,
        };
        let report = page.warmup(options).await.expect("warmup");

        let title_present = !report.title.is_empty();
        assert!(title_present, "title populated after warmup");
        assert!(report.elapsed_ms > 0);

        // The page must remain usable for further queries after warmup.
        let html = page.content().await.expect("content");
        let has_marker = html.contains("example");
        assert!(has_marker, "page content available after warmup");

        page.close().await.expect("close");
        lease.release().await;
    });
}
2379
/// Navigate, refresh, and check that the refresh report still points at the
/// same site.
///
/// Needs a live Chrome instance, so it is `#[ignore]`d by default.
#[test]
#[ignore = "requires live Chrome"]
#[allow(clippy::expect_used)]
fn integration_refresh_keeps_session_state() {
    let runtime = tokio::runtime::Runtime::new().expect("tokio runtime");
    runtime.block_on(async {
        use crate::{BrowserConfig, BrowserPool};

        let config = BrowserConfig::default();
        let pool = BrowserPool::new(config).await.expect("pool");
        let lease = pool.acquire().await.expect("handle");
        let mut page = lease
            .browser()
            .expect("browser")
            .new_page()
            .await
            .expect("page");

        let first_load_timeout = Duration::from_secs(30);
        page.navigate(
            "https://example.com",
            WaitUntil::DomContentLoaded,
            first_load_timeout,
        )
        .await
        .expect("initial navigate");

        let options = RefreshOptions::default();
        let report = page.refresh(options).await.expect("refresh");

        assert!(
            report.url.contains("example.com"),
            "URL retained after refresh; got: {}",
            report.url
        );
        assert!(report.elapsed_ms > 0);

        page.close().await.expect("close");
        lease.release().await;
    });
}
2424}