use std::collections::HashMap;
use std::sync::{
Arc,
atomic::{AtomicU16, Ordering},
};
use std::time::Duration;
use chromiumoxide::Page;
use tokio::time::timeout;
use tracing::{debug, warn};
use crate::error::{BrowserError, Result};
/// Category of sub-resource that a `ResourceFilter` can be told to block.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ResourceType {
    /// Image requests.
    Image,
    /// Font requests.
    Font,
    /// Stylesheet requests.
    Stylesheet,
    /// Media requests.
    Media,
}

impl ResourceType {
    /// Returns the fixed name used when comparing this variant against the
    /// resource-type string reported for a request.
    pub const fn as_cdp_str(&self) -> &'static str {
        match *self {
            Self::Media => "Media",
            Self::Stylesheet => "Stylesheet",
            Self::Font => "Font",
            Self::Image => "Image",
        }
    }
}
/// Set of resource types whose requests should be blocked.
///
/// The default value blocks nothing.
#[derive(Debug, Clone, Default)]
pub struct ResourceFilter {
    /// Resource types to block; `block` keeps this list free of duplicates.
    blocked: Vec<ResourceType>,
}

impl ResourceFilter {
    /// Filter blocking images, fonts, stylesheets and media.
    pub fn block_media() -> Self {
        let blocked = vec![
            ResourceType::Image,
            ResourceType::Font,
            ResourceType::Stylesheet,
            ResourceType::Media,
        ];
        Self { blocked }
    }

    /// Filter blocking images and fonts only.
    pub fn block_images_and_fonts() -> Self {
        let blocked = vec![ResourceType::Image, ResourceType::Font];
        Self { blocked }
    }

    /// Adds `resource` to the blocked set (no-op when already present) and
    /// returns the filter so calls can be chained.
    #[must_use]
    pub fn block(mut self, resource: ResourceType) -> Self {
        if self.blocked.contains(&resource) {
            return self;
        }
        self.blocked.push(resource);
        self
    }

    /// Returns `true` when `cdp_type` names a blocked resource type.
    ///
    /// The comparison ignores ASCII case.
    pub fn should_block(&self, cdp_type: &str) -> bool {
        self.blocked
            .iter()
            .map(ResourceType::as_cdp_str)
            .any(|name| name.eq_ignore_ascii_case(cdp_type))
    }

    /// Returns `true` when no resource type is blocked.
    pub const fn is_empty(&self) -> bool {
        self.blocked.is_empty()
    }
}
/// Condition a navigation waits for before it is considered complete.
#[derive(Debug, Clone)]
pub enum WaitUntil {
/// Wait for the page's DOM-content event to fire.
DomContentLoaded,
/// Wait for the page's load event, then for in-flight requests to settle.
NetworkIdle,
/// Wait until an element matching the contained CSS selector can be found.
Selector(String),
}
/// Handle to a single DOM element together with the context needed to query it.
pub struct NodeHandle {
// Underlying element every query on this handle is issued against.
element: chromiumoxide::element::Element,
// Selector (or derived `selector::suffix` label) reported in stale-node errors.
selector: Arc<str>,
// Upper bound applied to each individual CDP call made through this handle.
cdp_timeout: Duration,
// Owning page; used to resolve elements tagged during traversal.
page: chromiumoxide::Page,
}
impl NodeHandle {
/// Reads the attribute `name` from this element.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when the call exceeds `cdp_timeout`, and
/// otherwise whatever `cdp_err_or_stale` maps the CDP failure to.
pub async fn attr(&self, name: &str) -> Result<Option<String>> {
    let limit = self.cdp_timeout;
    match timeout(limit, self.element.attribute(name)).await {
        Ok(reply) => reply.map_err(|e| self.cdp_err_or_stale(&e, "attr")),
        Err(_) => Err(BrowserError::Timeout {
            operation: "NodeHandle::attr".to_string(),
            duration_ms: u64::try_from(limit.as_millis()).unwrap_or(u64::MAX),
        }),
    }
}
/// Collects all attributes of this element into a name → value map.
///
/// The element reports attributes as a flat list which is consumed here as
/// consecutive `[name, value]` pairs; an unpaired trailing entry is ignored.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when the call exceeds `cdp_timeout`, and
/// otherwise whatever `cdp_err_or_stale` maps the CDP failure to.
pub async fn attr_map(&self) -> Result<HashMap<String, String>> {
    let flat = match timeout(self.cdp_timeout, self.element.attributes()).await {
        Ok(reply) => reply.map_err(|e| self.cdp_err_or_stale(&e, "attr_map"))?,
        Err(_) => {
            return Err(BrowserError::Timeout {
                operation: "NodeHandle::attr_map".to_string(),
                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
            });
        }
    };

    let mut attrs = HashMap::with_capacity(flat.len() / 2);
    attrs.extend(flat.chunks_exact(2).filter_map(|entry| match entry {
        [name, value] => Some((name.clone(), value.clone())),
        _ => None,
    }));
    Ok(attrs)
}
/// Returns the element's `textContent`, or an empty string when the script
/// yields no string value.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when the call exceeds `cdp_timeout`, and
/// otherwise whatever `cdp_err_or_stale` maps the CDP failure to.
pub async fn text_content(&self) -> Result<String> {
    let call = self
        .element
        .call_js_fn(r"function() { return this.textContent ?? ''; }", true);
    let returns = match timeout(self.cdp_timeout, call).await {
        Ok(reply) => reply.map_err(|e| self.cdp_err_or_stale(&e, "text_content"))?,
        Err(_) => {
            return Err(BrowserError::Timeout {
                operation: "NodeHandle::text_content".to_string(),
                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
            });
        }
    };

    let text = returns
        .result
        .value
        .as_ref()
        .and_then(|v| v.as_str())
        .map(ToString::to_string)
        .unwrap_or_default();
    Ok(text)
}
/// Returns the element's inner HTML, or an empty string when none is reported.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when the call exceeds `cdp_timeout`, and
/// otherwise whatever `cdp_err_or_stale` maps the CDP failure to.
pub async fn inner_html(&self) -> Result<String> {
    match timeout(self.cdp_timeout, self.element.inner_html()).await {
        Ok(Ok(html)) => Ok(html.unwrap_or_default()),
        Ok(Err(e)) => Err(self.cdp_err_or_stale(&e, "inner_html")),
        Err(_) => Err(BrowserError::Timeout {
            operation: "NodeHandle::inner_html".to_string(),
            duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
        }),
    }
}
/// Returns the element's outer HTML, or an empty string when none is reported.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when the call exceeds `cdp_timeout`, and
/// otherwise whatever `cdp_err_or_stale` maps the CDP failure to.
pub async fn outer_html(&self) -> Result<String> {
    match timeout(self.cdp_timeout, self.element.outer_html()).await {
        Ok(Ok(html)) => Ok(html.unwrap_or_default()),
        Ok(Err(e)) => Err(self.cdp_err_or_stale(&e, "outer_html")),
        Err(_) => Err(BrowserError::Timeout {
            operation: "NodeHandle::outer_html".to_string(),
            duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
        }),
    }
}
pub async fn ancestors(&self) -> Result<Vec<String>> {
let returns = timeout(
self.cdp_timeout,
self.element.call_js_fn(
r"function() {
const a = [];
let n = this.parentElement;
while (n) { a.push(n.tagName.toLowerCase()); n = n.parentElement; }
return a;
}",
true,
),
)
.await
.map_err(|_| BrowserError::Timeout {
operation: "NodeHandle::ancestors".to_string(),
duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
})?
.map_err(|e| self.cdp_err_or_stale(&e, "ancestors"))?;
let arr = returns
.result
.value
.as_ref()
.and_then(|v| v.as_array())
.ok_or_else(|| BrowserError::ScriptExecutionFailed {
script: "NodeHandle::ancestors".to_string(),
reason: "CDP returned no value or a non-array value for ancestors()".to_string(),
})?;
arr.iter()
.map(|v| {
v.as_str().map(ToString::to_string).ok_or_else(|| {
BrowserError::ScriptExecutionFailed {
script: "NodeHandle::ancestors".to_string(),
reason: format!("ancestor entry is not a string: {v}"),
}
})
})
.collect()
}
/// Finds elements matching `selector` relative to this element and wraps each
/// one in a new handle labelled with `selector`.
///
/// NOTE(review): the search is delegated to the element's `find_elements`;
/// presumably that matches all descendants rather than only direct children —
/// confirm against the chromiumoxide documentation.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when the call exceeds `cdp_timeout`, and
/// otherwise whatever `cdp_err_or_stale` maps the CDP failure to.
pub async fn children_matching(&self, selector: &str) -> Result<Vec<Self>> {
    let lookup = self.element.find_elements(selector);
    let elements = match timeout(self.cdp_timeout, lookup).await {
        Ok(found) => found.map_err(|e| self.cdp_err_or_stale(&e, "children_matching"))?,
        Err(_) => {
            return Err(BrowserError::Timeout {
                operation: "NodeHandle::children_matching".to_string(),
                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
            });
        }
    };

    // One shared allocation for the selector label across all returned handles.
    let label: Arc<str> = Arc::from(selector);
    let mut handles = Vec::with_capacity(elements.len());
    for element in elements {
        handles.push(Self {
            element,
            selector: Arc::clone(&label),
            cdp_timeout: self.cdp_timeout,
            page: self.page.clone(),
        });
    }
    Ok(handles)
}
/// Returns a handle to this element's parent element, or `None` when it has
/// no parent element.
///
/// The parent is tagged in the page with a freshly generated
/// `data-stygian-t-…` attribute so that `call_traversal` can resolve it.
///
/// # Errors
///
/// Propagates any error returned by `call_traversal`.
pub async fn parent(&self) -> Result<Option<Self>> {
    let token = ulid::Ulid::new().to_string().to_lowercase();
    let attr = format!("data-stygian-t-{token}");
    let js = format!(
        "function() {{ var t = this.parentElement; \
         if (!t) {{ return false; }} t.setAttribute('{attr}', '1'); \
         return true; }}"
    );
    self.call_traversal(&js, &attr, "parent").await
}
/// Returns a handle to the next element sibling, or `None` when there is none.
///
/// The sibling is tagged in the page with a freshly generated
/// `data-stygian-t-…` attribute so that `call_traversal` can resolve it.
///
/// # Errors
///
/// Propagates any error returned by `call_traversal`.
pub async fn next_sibling(&self) -> Result<Option<Self>> {
    let token = ulid::Ulid::new().to_string().to_lowercase();
    let attr = format!("data-stygian-t-{token}");
    let js = format!(
        "function() {{ var t = this.nextElementSibling; \
         if (!t) {{ return false; }} t.setAttribute('{attr}', '1'); \
         return true; }}"
    );
    self.call_traversal(&js, &attr, "next").await
}
/// Returns a handle to the previous element sibling, or `None` when there is
/// none.
///
/// The sibling is tagged in the page with a freshly generated
/// `data-stygian-t-…` attribute so that `call_traversal` can resolve it.
///
/// # Errors
///
/// Propagates any error returned by `call_traversal`.
pub async fn previous_sibling(&self) -> Result<Option<Self>> {
    let token = ulid::Ulid::new().to_string().to_lowercase();
    let attr = format!("data-stygian-t-{token}");
    let js = format!(
        "function() {{ var t = this.previousElementSibling; \
         if (!t) {{ return false; }} t.setAttribute('{attr}', '1'); \
         return true; }}"
    );
    self.call_traversal(&js, &attr, "prev").await
}
async fn call_traversal(
&self,
js_fn: &str,
attr_name: &str,
selector_suffix: &str,
) -> Result<Option<Self>> {
let op_tag = format!("NodeHandle::{selector_suffix}::tag");
let returns = timeout(self.cdp_timeout, self.element.call_js_fn(js_fn, false))
.await
.map_err(|_| BrowserError::Timeout {
operation: op_tag.clone(),
duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
})?
.map_err(|e| self.cdp_err_or_stale(&e, selector_suffix))?;
let has_target = returns
.result
.value
.as_ref()
.and_then(serde_json::Value::as_bool)
.unwrap_or(false);
if !has_target {
return Ok(None);
}
let css = format!("[{attr_name}]");
let op_resolve = format!("NodeHandle::{selector_suffix}::resolve");
let element = timeout(self.cdp_timeout, self.page.find_element(css))
.await
.map_err(|_| BrowserError::Timeout {
operation: op_resolve.clone(),
duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
})?
.map_err(|e| BrowserError::CdpError {
operation: op_resolve,
message: e.to_string(),
})?;
let cleanup = format!("function() {{ this.removeAttribute('{attr_name}'); }}");
let _ = element.call_js_fn(cleanup, false).await;
let new_selector: Arc<str> =
Arc::from(format!("{}::{selector_suffix}", self.selector).as_str());
Ok(Some(Self {
element,
selector: new_selector,
cdp_timeout: self.cdp_timeout,
page: self.page.clone(),
}))
}
/// Converts a CDP error into a `BrowserError`.
///
/// Errors whose message contains one of the known "object/context no longer
/// exists" phrases become `BrowserError::StaleNode` carrying this handle's
/// selector; everything else becomes `BrowserError::CdpError` tagged with
/// `operation`.
fn cdp_err_or_stale(
    &self,
    err: &chromiumoxide::error::CdpError,
    operation: &str,
) -> BrowserError {
    const STALE_MARKERS: [&str; 3] = [
        "Cannot find object with id",
        "context with specified id",
        "Cannot find context",
    ];

    let msg = err.to_string();
    let is_stale = STALE_MARKERS.iter().any(|marker| msg.contains(*marker));
    if is_stale {
        return BrowserError::StaleNode {
            selector: self.selector.to_string(),
        };
    }
    BrowserError::CdpError {
        operation: operation.to_string(),
        message: msg,
    }
}
}
/// Handle to a browser page plus the per-page state this module maintains.
pub struct PageHandle {
// Underlying page all commands are issued against.
page: Page,
// Upper bound applied to individual CDP calls made through this handle.
cdp_timeout: Duration,
// Status code captured for the most recent navigation; 0 means "none captured".
last_status_code: Arc<AtomicU16>,
// Background task servicing paused requests while a resource filter is active.
resource_filter_task: Option<tokio::task::JoinHandle<()>>,
}
impl PageHandle {
/// Wraps `page` with the given per-call CDP timeout.
///
/// The handle starts with no captured status code and no resource filter.
pub(crate) fn new(page: Page, cdp_timeout: Duration) -> Self {
    let last_status_code = Arc::new(AtomicU16::default());
    Self {
        page,
        cdp_timeout,
        last_status_code,
        resource_filter_task: None,
    }
}
/// Navigates to `url` and waits for `condition`, bounded overall by
/// `nav_timeout`.
///
/// Status-code capture is (re)armed before the navigation starts.
///
/// # Errors
///
/// Returns `BrowserError::NavigationFailed` when the whole operation exceeds
/// `nav_timeout`, or whatever error the navigation itself produced.
pub async fn navigate(
    &mut self,
    url: &str,
    condition: WaitUntil,
    nav_timeout: Duration,
) -> Result<()> {
    self.setup_status_capture().await;

    let attempt = self.navigate_inner(url, condition, nav_timeout);
    match timeout(nav_timeout, attempt).await {
        Ok(outcome) => outcome,
        Err(_) => Err(BrowserError::NavigationFailed {
            url: url.to_string(),
            reason: format!("navigation timed out after {nav_timeout:?}"),
        }),
    }
}
/// Resets the stored status code to 0 and spawns a task that records the
/// status of the first `Document` response seen on this page.
///
/// The task stops after that first `Document` response whether or not its
/// status could be stored (statuses that do not fit a non-zero `u16` are
/// skipped). If the event subscription fails, a warning is logged and no
/// task is spawned.
async fn setup_status_capture(&self) {
    use chromiumoxide::cdp::browser_protocol::network::{
        EventResponseReceived, ResourceType as NetworkResourceType,
    };
    use futures::StreamExt;

    self.last_status_code.store(0, Ordering::Release);

    let subscription = self.page.event_listener::<EventResponseReceived>().await;
    let mut responses = match subscription {
        Ok(stream) => stream,
        Err(e) => {
            warn!("status-code capture unavailable: {e}");
            return;
        }
    };

    let slot = Arc::clone(&self.last_status_code);
    tokio::spawn(async move {
        while let Some(event) = responses.next().await {
            if event.r#type != NetworkResourceType::Document {
                continue;
            }
            match u16::try_from(event.response.status) {
                Ok(code) if code > 0 => slot.store(code, Ordering::Release),
                _ => {}
            }
            break;
        }
    });
}
/// Performs the navigation to `url` and then waits for `condition`.
///
/// Event listeners needed by the chosen condition are subscribed *before*
/// `goto` is issued. `nav_timeout` is only used here as the limit passed to
/// `wait_for_selector` for `WaitUntil::Selector`.
///
/// # Errors
///
/// Returns `BrowserError::NavigationFailed` when subscribing to page events or
/// the `goto` call fails, and propagates any error from `wait_for_selector`.
async fn navigate_inner(
&self,
url: &str,
condition: WaitUntil,
nav_timeout: Duration,
) -> Result<()> {
use chromiumoxide::cdp::browser_protocol::page::{
EventDomContentEventFired, EventLoadEventFired,
};
use futures::StreamExt;
let url_owned = url.to_string();
// Subscribe only to the event stream the requested condition needs.
let mut dom_events = match &condition {
WaitUntil::DomContentLoaded => Some(
self.page
.event_listener::<EventDomContentEventFired>()
.await
.map_err(|e| BrowserError::NavigationFailed {
url: url_owned.clone(),
reason: e.to_string(),
})?,
),
_ => None,
};
let mut load_events = match &condition {
WaitUntil::NetworkIdle => Some(
self.page
.event_listener::<EventLoadEventFired>()
.await
.map_err(|e| BrowserError::NavigationFailed {
url: url_owned.clone(),
reason: e.to_string(),
})?,
),
_ => None,
};
// The in-flight request counter is only needed for the network-idle condition.
let inflight = if matches!(condition, WaitUntil::NetworkIdle) {
Some(self.subscribe_inflight_counter().await)
} else {
None
};
self.page
.goto(url)
.await
.map_err(|e| BrowserError::NavigationFailed {
url: url_owned.clone(),
reason: e.to_string(),
})?;
match &condition {
WaitUntil::DomContentLoaded => {
if let Some(ref mut events) = dom_events {
// The event payload is irrelevant; a closed stream also ends the wait.
let _ = events.next().await;
}
}
WaitUntil::NetworkIdle => {
// First the load event, then wait for the request counter to settle.
if let Some(ref mut events) = load_events {
let _ = events.next().await;
}
if let Some(ref counter) = inflight {
Self::wait_network_idle(counter).await;
}
}
WaitUntil::Selector(css) => {
self.wait_for_selector(css, nav_timeout).await?;
}
}
Ok(())
}
/// Starts tracking in-flight network requests and returns the shared counter.
///
/// Three background tasks are spawned: one adds 1 for every
/// `Network.requestWillBeSent` event, the others subtract 1 for every
/// `Network.loadingFinished` / `Network.loadingFailed` event. If a
/// subscription fails, a warning is logged and that tracker is skipped.
///
/// The tasks hold only weak references to the counter. Once the caller
/// drops the returned `Arc`, each task exits on the next event it receives
/// instead of consuming events for the rest of the page's lifetime.
///
/// NOTE(review): a redirected request may emit more than one
/// `requestWillBeSent` for a single finish event, which would bias the
/// counter upwards — confirm against the CDP Network domain documentation.
async fn subscribe_inflight_counter(&self) -> Arc<std::sync::atomic::AtomicI32> {
    use std::sync::atomic::AtomicI32;
    use chromiumoxide::cdp::browser_protocol::network::{
        EventLoadingFailed, EventLoadingFinished, EventRequestWillBeSent,
    };
    use futures::StreamExt;

    let counter: Arc<AtomicI32> = Arc::new(AtomicI32::new(0));

    macro_rules! spawn_tracker {
        ($page:expr, $event:ty, $counter:expr, $delta:expr) => {
            match $page.event_listener::<$event>().await {
                Ok(mut events) => {
                    let observer = Arc::downgrade($counter);
                    let delta: i32 = $delta;
                    tokio::spawn(async move {
                        while events.next().await.is_some() {
                            match observer.upgrade() {
                                Some(live) => {
                                    live.fetch_add(delta, Ordering::Relaxed);
                                }
                                // Nobody is waiting on the counter any more.
                                None => break,
                            }
                        }
                    });
                }
                Err(e) => warn!("network-idle tracker unavailable: {e}"),
            }
        };
    }

    spawn_tracker!(self.page, EventRequestWillBeSent, &counter, 1);
    spawn_tracker!(self.page, EventLoadingFinished, &counter, -1);
    spawn_tracker!(self.page, EventLoadingFailed, &counter, -1);
    counter
}
/// Waits until the in-flight request counter is at or below the idle
/// threshold both before and after a 500 ms settle period.
///
/// While the counter is above the threshold it is re-checked every 50 ms.
/// This function has no timeout of its own.
async fn wait_network_idle(counter: &Arc<std::sync::atomic::AtomicI32>) {
    const IDLE_THRESHOLD: i32 = 2;
    const SETTLE: Duration = Duration::from_millis(500);
    const POLL: Duration = Duration::from_millis(50);

    let is_quiet = || counter.load(Ordering::Relaxed) <= IDLE_THRESHOLD;
    loop {
        if !is_quiet() {
            tokio::time::sleep(POLL).await;
            continue;
        }
        // Looks idle: give late requests a chance to start before confirming.
        tokio::time::sleep(SETTLE).await;
        if is_quiet() {
            return;
        }
    }
}
/// Polls the page every 100 ms until an element matching `selector` is found.
///
/// # Errors
///
/// Returns `BrowserError::NavigationFailed` (with an empty `url`) when no
/// match is found within `wait_timeout`.
pub async fn wait_for_selector(&self, selector: &str, wait_timeout: Duration) -> Result<()> {
    let selector_owned = selector.to_string();
    let poll = async {
        while self
            .page
            .find_element(selector_owned.clone())
            .await
            .is_err()
        {
            tokio::time::sleep(Duration::from_millis(100)).await;
        }
    };

    match timeout(wait_timeout, poll).await {
        Ok(()) => Ok(()),
        Err(_) => Err(BrowserError::NavigationFailed {
            url: String::new(),
            reason: format!("selector '{selector_owned}' not found within {wait_timeout:?}"),
        }),
    }
}
/// Installs `filter` so that matching requests on this page are failed.
///
/// An empty filter is a no-op: the method returns immediately and leaves any
/// previously installed filter task running. Otherwise any previous filter
/// task is aborted, request interception is enabled for all URLs, and a
/// background task is spawned that fails blocked requests with
/// `BlockedByClient` and continues every other request.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when enabling interception exceeds
/// `cdp_timeout`, and `BrowserError::CdpError` when enabling interception or
/// subscribing to paused-request events fails.
pub async fn set_resource_filter(&mut self, filter: ResourceFilter) -> Result<()> {
use chromiumoxide::cdp::browser_protocol::fetch::{
ContinueRequestParams, EnableParams, EventRequestPaused, FailRequestParams,
RequestPattern,
};
use chromiumoxide::cdp::browser_protocol::network::ErrorReason;
use futures::StreamExt as _;
if filter.is_empty() {
return Ok(());
}
// Replace, rather than stack, any filter task installed earlier.
if let Some(task) = self.resource_filter_task.take() {
task.abort();
}
let pattern = RequestPattern::builder().url_pattern("*").build();
let params = EnableParams::builder()
.patterns(vec![pattern])
.handle_auth_requests(false)
.build();
timeout(self.cdp_timeout, self.page.execute::<EnableParams>(params))
.await
.map_err(|_| BrowserError::Timeout {
operation: "Fetch.enable".to_string(),
duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
})?
.map_err(|e| BrowserError::CdpError {
operation: "Fetch.enable".to_string(),
message: e.to_string(),
})?;
let mut events = self
.page
.event_listener::<EventRequestPaused>()
.await
.map_err(|e| BrowserError::CdpError {
operation: "Fetch.requestPaused subscribe".to_string(),
message: e.to_string(),
})?;
let page = self.page.clone();
debug!("Resource filter active: {:?}", filter);
let task = tokio::spawn(async move {
while let Some(event) = events.next().await {
let request_id = event.request_id.clone();
// Every paused request is answered one way or the other; errors from
// the answering command are deliberately ignored.
if filter.should_block(event.resource_type.as_ref()) {
let params = FailRequestParams::new(request_id, ErrorReason::BlockedByClient);
let _ = page.execute(params).await;
} else {
let _ = page.execute(ContinueRequestParams::new(request_id)).await;
}
}
});
self.resource_filter_task = Some(task);
Ok(())
}
/// Returns the page's current URL, or an empty string when none is reported.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when the call exceeds `cdp_timeout` and
/// `BrowserError::CdpError` when the call itself fails.
pub async fn url(&self) -> Result<String> {
    match timeout(self.cdp_timeout, self.page.url()).await {
        Ok(Ok(current)) => Ok(current.unwrap_or_default()),
        Ok(Err(e)) => Err(BrowserError::CdpError {
            operation: "page.url".to_string(),
            message: e.to_string(),
        }),
        Err(_) => Err(BrowserError::Timeout {
            operation: "page.url".to_string(),
            duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
        }),
    }
}
/// Returns the status code captured for the most recent navigation, or
/// `None` when nothing has been captured (stored value 0).
///
/// # Errors
///
/// This implementation always returns `Ok`.
pub fn status_code(&self) -> Result<Option<u16>> {
    match self.last_status_code.load(Ordering::Acquire) {
        0 => Ok(None),
        code => Ok(Some(code)),
    }
}
/// Returns the page title, or an empty string when none is reported.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when the call exceeds `cdp_timeout` and
/// `BrowserError::ScriptExecutionFailed` when the call itself fails.
pub async fn title(&self) -> Result<String> {
    match timeout(self.cdp_timeout, self.page.get_title()).await {
        Ok(Ok(title)) => Ok(title.unwrap_or_default()),
        Ok(Err(e)) => Err(BrowserError::ScriptExecutionFailed {
            script: "document.title".to_string(),
            reason: e.to_string(),
        }),
        Err(_) => Err(BrowserError::Timeout {
            operation: "get_title".to_string(),
            duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
        }),
    }
}
/// Returns the page content as reported by the underlying page.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when the call exceeds `cdp_timeout` and
/// `BrowserError::ScriptExecutionFailed` when the call itself fails.
pub async fn content(&self) -> Result<String> {
    match timeout(self.cdp_timeout, self.page.content()).await {
        Ok(fetched) => fetched.map_err(|e| BrowserError::ScriptExecutionFailed {
            script: "document.documentElement.outerHTML".to_string(),
            reason: e.to_string(),
        }),
        Err(_) => Err(BrowserError::Timeout {
            operation: "page.content".to_string(),
            duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
        }),
    }
}
/// Finds every element on the page matching `selector` and wraps each in a
/// `NodeHandle` labelled with that selector.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when the lookup exceeds `cdp_timeout` and
/// `BrowserError::CdpError` when the lookup itself fails.
pub async fn query_selector_all(&self, selector: &str) -> Result<Vec<NodeHandle>> {
    let lookup = self.page.find_elements(selector);
    let elements = match timeout(self.cdp_timeout, lookup).await {
        Ok(found) => found.map_err(|e| BrowserError::CdpError {
            operation: "PageHandle::query_selector_all".to_string(),
            message: e.to_string(),
        })?,
        Err(_) => {
            return Err(BrowserError::Timeout {
                operation: "PageHandle::query_selector_all".to_string(),
                duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
            });
        }
    };

    // One shared allocation for the selector label across all handles.
    let label: Arc<str> = Arc::from(selector);
    let mut handles = Vec::with_capacity(elements.len());
    for element in elements {
        handles.push(NodeHandle {
            element,
            selector: Arc::clone(&label),
            cdp_timeout: self.cdp_timeout,
            page: self.page.clone(),
        });
    }
    Ok(handles)
}
/// Evaluates `script` in the page and deserialises its result into `T`.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when evaluation exceeds `cdp_timeout`, and
/// `BrowserError::ScriptExecutionFailed` (carrying the script text) when
/// evaluation fails or the result cannot be converted into `T`.
pub async fn eval<T: serde::de::DeserializeOwned>(&self, script: &str) -> Result<T> {
    match timeout(self.cdp_timeout, self.page.evaluate(script)).await {
        Err(_) => Err(BrowserError::Timeout {
            operation: "page.evaluate".to_string(),
            duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
        }),
        Ok(Err(e)) => Err(BrowserError::ScriptExecutionFailed {
            script: script.to_string(),
            reason: e.to_string(),
        }),
        Ok(Ok(outcome)) => {
            outcome
                .into_value::<T>()
                .map_err(|e| BrowserError::ScriptExecutionFailed {
                    script: script.to_string(),
                    reason: e.to_string(),
                })
        }
    }
}
/// Returns the cookies the browser reports for the page's current URL.
///
/// The URL lookup goes through [`Self::url`], so it is bounded by
/// `cdp_timeout` like the cookie query itself.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when either the URL lookup or the cookie
/// query exceeds `cdp_timeout`, and `BrowserError::CdpError` when either call
/// fails.
pub async fn save_cookies(
    &self,
) -> Result<Vec<chromiumoxide::cdp::browser_protocol::network::Cookie>> {
    use chromiumoxide::cdp::browser_protocol::network::GetCookiesParams;

    // Previously awaited `page.url()` directly with no upper bound.
    let url = self.url().await?;

    timeout(
        self.cdp_timeout,
        self.page
            .execute(GetCookiesParams::builder().urls(vec![url]).build()),
    )
    .await
    .map_err(|_| BrowserError::Timeout {
        operation: "Network.getCookies".to_string(),
        duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
    })?
    .map_err(|e| BrowserError::CdpError {
        operation: "Network.getCookies".to_string(),
        message: e.to_string(),
    })
    .map(|r| r.cookies.clone())
}
/// Sets each of `cookies` on the page, one CDP call per cookie.
///
/// Injection is best-effort: a cookie whose parameters cannot be built, whose
/// call times out, or whose call fails is logged with a warning and skipped.
/// The final debug log reports the number of cookies passed in.
///
/// # Errors
///
/// This implementation always returns `Ok`.
pub async fn inject_cookies(&self, cookies: &[crate::session::SessionCookie]) -> Result<()> {
    use chromiumoxide::cdp::browser_protocol::network::SetCookieParams;

    for cookie in cookies {
        let built = SetCookieParams::builder()
            .name(cookie.name.clone())
            .value(cookie.value.clone())
            .domain(cookie.domain.clone())
            .path(cookie.path.clone())
            .http_only(cookie.http_only)
            .secure(cookie.secure)
            .build();
        let params = match built {
            Ok(p) => p,
            Err(e) => {
                warn!(cookie = %cookie.name, error = %e, "Failed to build cookie params");
                continue;
            }
        };

        let outcome = timeout(self.cdp_timeout, self.page.execute(params)).await;
        match outcome {
            Ok(Ok(_)) => {}
            Ok(Err(e)) => {
                warn!(cookie = %cookie.name, error = %e, "Failed to inject cookie");
            }
            Err(_) => {
                warn!(
                    cookie = %cookie.name,
                    timeout_ms = self.cdp_timeout.as_millis(),
                    "Timed out injecting cookie"
                );
            }
        }
    }

    debug!(count = cookies.len(), "Cookies injected");
    Ok(())
}
/// Captures a full-page screenshot and returns the image bytes.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when the capture exceeds `cdp_timeout` and
/// `BrowserError::CdpError` when the capture itself fails.
pub async fn screenshot(&self) -> Result<Vec<u8>> {
    use chromiumoxide::page::ScreenshotParams;

    let params = ScreenshotParams::builder().full_page(true).build();
    match timeout(self.cdp_timeout, self.page.screenshot(params)).await {
        Ok(captured) => captured.map_err(|e| BrowserError::CdpError {
            operation: "Page.captureScreenshot".to_string(),
            message: e.to_string(),
        }),
        Err(_) => Err(BrowserError::Timeout {
            operation: "Page.captureScreenshot".to_string(),
            duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
        }),
    }
}
/// Borrows the underlying `Page` for operations this wrapper does not expose.
pub const fn inner(&self) -> &Page {
&self.page
}
/// Closes the page, consuming the handle.
///
/// The close call is bounded by a fixed 5-second limit rather than by
/// `cdp_timeout`.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when closing exceeds 5 seconds and
/// `BrowserError::CdpError` when the close call itself fails.
pub async fn close(self) -> Result<()> {
    const CLOSE_LIMIT: Duration = Duration::from_secs(5);

    match timeout(CLOSE_LIMIT, self.page.clone().close()).await {
        Ok(closed) => closed.map_err(|e| BrowserError::CdpError {
            operation: "page.close".to_string(),
            message: e.to_string(),
        }),
        Err(_) => Err(BrowserError::Timeout {
            operation: "page.close".to_string(),
            duration_ms: 5000,
        }),
    }
}
}
#[cfg(feature = "stealth")]
impl PageHandle {
/// Runs every check returned by `all_checks()` in the page and collects the
/// outcomes into a `DiagnosticReport`.
///
/// A check whose script fails to evaluate is recorded as not passed, with the
/// error text in its details; it does not abort the run.
///
/// # Errors
///
/// This implementation always returns `Ok`.
pub async fn verify_stealth(&self) -> Result<crate::diagnostic::DiagnosticReport> {
    use crate::diagnostic::{CheckResult, DiagnosticReport, all_checks};

    let mut results: Vec<CheckResult> = Vec::new();
    for check in all_checks() {
        let evaluated = self.eval::<String>(check.script).await;
        let result = match evaluated {
            Err(e) => {
                tracing::warn!(
                    check = ?check.id,
                    error = %e,
                    "stealth check script failed during evaluation"
                );
                CheckResult {
                    id: check.id,
                    description: check.description.to_string(),
                    passed: false,
                    details: format!("script error: {e}"),
                }
            }
            Ok(json) => check.parse_output(&json),
        };
        tracing::debug!(
            check = ?result.id,
            passed = result.passed,
            details = %result.details,
            "stealth check result"
        );
        results.push(result);
    }
    Ok(DiagnosticReport::new(results))
}
/// Runs `verify_stealth` and attaches a transport diagnostic built from the
/// page's `navigator.userAgent` and the optional `observed` data.
///
/// If the user agent cannot be read, a warning is logged and an empty string
/// is used instead.
///
/// # Errors
///
/// Propagates any error returned by `verify_stealth`.
pub async fn verify_stealth_with_transport(
    &self,
    observed: Option<crate::diagnostic::TransportObservations>,
) -> Result<crate::diagnostic::DiagnosticReport> {
    let report = self.verify_stealth().await?;

    let user_agent = self
        .eval::<String>("navigator.userAgent")
        .await
        .unwrap_or_else(|e| {
            tracing::warn!(error = %e, "failed to read navigator.userAgent for transport diagnostics");
            String::new()
        });

    let transport = crate::diagnostic::TransportDiagnostic::from_user_agent_and_observations(
        &user_agent,
        observed.as_ref(),
    );
    Ok(report.with_transport(transport))
}
}
#[cfg(feature = "extract")]
impl PageHandle {
/// Extracts a `T` from every element matching `selector`.
///
/// All extractions are driven together via `try_join_all`, which stops at the
/// first failure.
///
/// # Errors
///
/// Propagates errors from `query_selector_all`; an extraction failure is
/// wrapped in `BrowserError::ExtractionFailed`.
pub async fn extract_all<T>(&self, selector: &str) -> Result<Vec<T>>
where
    T: crate::extract::Extractable,
{
    use futures::future::try_join_all;

    let nodes = self.query_selector_all(selector).await?;
    let pending = nodes.iter().map(|node| T::extract_from(node));
    let extracted = try_join_all(pending).await;
    extracted.map_err(BrowserError::ExtractionFailed)
}
}
#[cfg(feature = "similarity")]
impl NodeHandle {
/// Computes a structural fingerprint of this element.
///
/// The in-page script gathers the lower-cased tag name, the sorted class
/// list, the sorted attribute names (excluding `class` and `id`) and the
/// number of ancestors below `body`, and returns them as a JSON string that
/// is deserialised into an `ElementFingerprint`.
///
/// # Errors
///
/// Returns `BrowserError::Timeout` when the call exceeds `cdp_timeout`,
/// `BrowserError::ScriptExecutionFailed` when no string is returned or the
/// JSON cannot be deserialised, and otherwise whatever `cdp_err_or_stale`
/// maps the CDP failure to.
pub async fn fingerprint(&self) -> Result<crate::similarity::ElementFingerprint> {
// Sent to the browser verbatim; the result is serialised in-page with JSON.stringify.
const JS: &str = r"function() {
var el = this;
var tag = el.tagName.toLowerCase();
var classes = Array.prototype.slice.call(el.classList).sort();
var attrNames = Array.prototype.slice.call(el.attributes)
.map(function(a) { return a.name; })
.filter(function(n) { return n !== 'class' && n !== 'id'; })
.sort();
var depth = 0;
var n = el.parentElement;
while (n && n.tagName.toLowerCase() !== 'body') { depth++; n = n.parentElement; }
return JSON.stringify({ tag: tag, classes: classes, attrNames: attrNames, depth: depth });
}";
let returns = tokio::time::timeout(self.cdp_timeout, self.element.call_js_fn(JS, true))
.await
.map_err(|_| BrowserError::Timeout {
operation: "NodeHandle::fingerprint".to_string(),
duration_ms: u64::try_from(self.cdp_timeout.as_millis()).unwrap_or(u64::MAX),
})?
.map_err(|e| self.cdp_err_or_stale(&e, "fingerprint"))?;
// The script returns a JSON *string*, so a non-string value is an error.
let json_str = returns
.result
.value
.as_ref()
.and_then(|v| v.as_str())
.ok_or_else(|| BrowserError::ScriptExecutionFailed {
script: "NodeHandle::fingerprint".to_string(),
reason: "CDP returned no string value from fingerprint script".to_string(),
})?;
serde_json::from_str::<crate::similarity::ElementFingerprint>(json_str).map_err(|e| {
BrowserError::ScriptExecutionFailed {
script: "NodeHandle::fingerprint".to_string(),
reason: format!("failed to deserialise fingerprint JSON: {e}"),
}
})
}
}
#[cfg(feature = "similarity")]
impl PageHandle {
/// Finds elements on the page whose fingerprint is similar to `reference`.
///
/// Every element matched by `*` is fingerprinted one after another and
/// scored with `jaccard_weighted`; candidates that fail to fingerprint are
/// skipped. Matches scoring at least `config.threshold` are returned in
/// descending score order, truncated to `config.max_results` when that is
/// non-zero.
///
/// # Errors
///
/// Propagates errors from fingerprinting `reference` and from
/// `query_selector_all`.
pub async fn find_similar(
    &self,
    reference: &NodeHandle,
    config: crate::similarity::SimilarityConfig,
) -> Result<Vec<crate::similarity::SimilarMatch>> {
    use crate::similarity::{SimilarMatch, jaccard_weighted};

    let ref_fp = reference.fingerprint().await?;
    let mut matches: Vec<SimilarMatch> = Vec::new();

    for node in self.query_selector_all("*").await? {
        let score = match node.fingerprint().await {
            Ok(cand_fp) => jaccard_weighted(&ref_fp, &cand_fp),
            Err(_) => continue,
        };
        if score >= config.threshold {
            matches.push(SimilarMatch { node, score });
        }
    }

    // Highest score first; incomparable scores are treated as equal.
    matches.sort_by(|lhs, rhs| {
        rhs.score
            .partial_cmp(&lhs.score)
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    // A limit of zero means "no limit".
    if config.max_results != 0 {
        matches.truncate(config.max_results);
    }
    Ok(matches)
}
}
/// Best-effort cleanup when a `PageHandle` goes out of scope.
///
/// Aborts the resource-filter task, if any, so it does not outlive the
/// handle, then spawns a task that closes the page. The close task is only
/// spawned when a Tokio runtime is available on the current thread; spawning
/// without one would panic inside `drop`.
///
/// NOTE(review): `close(self)` consumes the handle, so this also runs after
/// an explicit close; the warning and the second close are redundant then.
impl Drop for PageHandle {
    fn drop(&mut self) {
        if let Some(task) = self.resource_filter_task.take() {
            task.abort();
        }

        match tokio::runtime::Handle::try_current() {
            Ok(runtime) => {
                warn!("PageHandle dropped without explicit close(); spawning cleanup task");
                let page = self.page.clone();
                runtime.spawn(async move {
                    let _ = page.close().await;
                });
            }
            Err(e) => {
                warn!("PageHandle dropped outside a Tokio runtime; page was not closed: {e}");
            }
        }
    }
}
// Unit tests for the pieces of this module that need no running browser.
#[cfg(test)]
mod tests {
use super::*;
// `block_media` blocks all four resource types and nothing else.
#[test]
fn resource_filter_block_media_blocks_image() {
let filter = ResourceFilter::block_media();
assert!(filter.should_block("Image"));
assert!(filter.should_block("Font"));
assert!(filter.should_block("Stylesheet"));
assert!(filter.should_block("Media"));
assert!(!filter.should_block("Script"));
assert!(!filter.should_block("XHR"));
}
// Resource-type matching ignores ASCII case.
#[test]
fn resource_filter_case_insensitive() {
let filter = ResourceFilter::block_images_and_fonts();
assert!(filter.should_block("image")); assert!(filter.should_block("IMAGE")); assert!(!filter.should_block("Stylesheet"));
}
// `block` can be chained starting from the empty default filter.
#[test]
fn resource_filter_builder_chain() {
let filter = ResourceFilter::default()
.block(ResourceType::Image)
.block(ResourceType::Font);
assert!(filter.should_block("Image"));
assert!(filter.should_block("Font"));
assert!(!filter.should_block("Stylesheet"));
}
// Blocking the same type twice stores it only once.
#[test]
fn resource_filter_dedup_block() {
let filter = ResourceFilter::default()
.block(ResourceType::Image)
.block(ResourceType::Image); assert_eq!(filter.blocked.len(), 1);
}
// The default filter is empty; a preset filter is not.
#[test]
fn resource_filter_is_empty_when_default() {
assert!(ResourceFilter::default().is_empty());
assert!(!ResourceFilter::block_media().is_empty());
}
// `WaitUntil::Selector` carries the selector string unchanged.
#[test]
fn wait_until_selector_stores_string() {
let w = WaitUntil::Selector("#foo".to_string());
assert!(matches!(w, WaitUntil::Selector(ref s) if s == "#foo"));
}
// Each variant maps to its fixed resource-type name.
#[test]
fn resource_type_cdp_str() {
assert_eq!(ResourceType::Image.as_cdp_str(), "Image");
assert_eq!(ResourceType::Font.as_cdp_str(), "Font");
assert_eq!(ResourceType::Stylesheet.as_cdp_str(), "Stylesheet");
assert_eq!(ResourceType::Media.as_cdp_str(), "Media");
}
// Compile-time check that `PageHandle` is `Send` and `Sync`.
#[test]
fn page_handle_is_send_sync() {
fn assert_send<T: Send>() {}
fn assert_sync<T: Sync>() {}
assert_send::<PageHandle>();
assert_sync::<PageHandle>();
}
// Mirrors the zero-sentinel mapping used by `status_code` on a local atomic.
#[test]
fn status_code_sentinel_zero_maps_to_none() {
use std::sync::atomic::{AtomicU16, Ordering};
let atom = AtomicU16::new(0);
let code = atom.load(Ordering::Acquire);
assert_eq!(if code == 0 { None } else { Some(code) }, None::<u16>);
}
// Mirrors the non-zero mapping used by `status_code` on a local atomic.
#[test]
fn status_code_non_zero_maps_to_some() {
use std::sync::atomic::{AtomicU16, Ordering};
for &expected in &[200u16, 301, 404, 503] {
let atom = AtomicU16::new(expected);
let code = atom.load(Ordering::Acquire);
assert_eq!(if code == 0 { None } else { Some(code) }, Some(expected));
}
}
// Mirrors the pair-chunking loop of `attr_map` on a flat name/value list.
#[test]
fn attr_map_chunking_pairs_correctly() {
let flat = [
"id".to_string(),
"main".to_string(),
"data-ux".to_string(),
"Section".to_string(),
"class".to_string(),
"container".to_string(),
];
let mut map = std::collections::HashMap::with_capacity(flat.len() / 2);
for pair in flat.chunks_exact(2) {
if let [name, value] = pair {
map.insert(name.clone(), value.clone());
}
}
assert_eq!(map.get("id").map(String::as_str), Some("main"));
assert_eq!(map.get("data-ux").map(String::as_str), Some("Section"));
assert_eq!(map.get("class").map(String::as_str), Some("container"));
assert_eq!(map.len(), 3);
}
// A trailing name without a value is dropped by `chunks_exact(2)`.
#[test]
fn attr_map_chunking_ignores_odd_trailing() {
let flat = ["orphan".to_string()]; let mut map = std::collections::HashMap::new();
for pair in flat.chunks_exact(2) {
if let [name, value] = pair {
map.insert(name.clone(), value.clone());
}
}
assert!(map.is_empty());
}
// An empty flat list yields an empty map.
#[test]
fn attr_map_chunking_empty_input() {
let flat: Vec<String> = vec![];
let map: std::collections::HashMap<String, String> = flat
.chunks_exact(2)
.filter_map(|pair| {
if let [name, value] = pair {
Some((name.clone(), value.clone()))
} else {
None
}
})
.collect();
assert!(map.is_empty());
}
// A JSON array of tag names deserialises into a `Vec<String>` in order.
#[test]
fn ancestors_json_parse_round_trip() -> std::result::Result<(), serde_json::Error> {
let json = r#"["p","article","body","html"]"#;
let result: Vec<String> = serde_json::from_str(json)?;
assert_eq!(result, ["p", "article", "body", "html"]);
Ok(())
}
// An empty JSON array deserialises into an empty vector.
#[test]
fn ancestors_json_parse_empty() -> std::result::Result<(), serde_json::Error> {
let json = "[]";
let result: Vec<String> = serde_json::from_str(json)?;
assert!(result.is_empty());
Ok(())
}
// The `StaleNode` display text includes a `::parent`-suffixed selector.
#[test]
fn traversal_selector_suffix_in_stale_error() {
let e = crate::error::BrowserError::StaleNode {
selector: "div::parent".to_string(),
};
let msg = e.to_string();
assert!(
msg.contains("div::parent"),
"StaleNode display must include the full selector; got: {msg}"
);
}
// The `StaleNode` display text includes a `::next`-suffixed selector.
#[test]
fn traversal_next_suffix_in_stale_error() {
let e = crate::error::BrowserError::StaleNode {
selector: "li.price::next".to_string(),
};
assert!(e.to_string().contains("li.price::next"));
}
// The `StaleNode` display text includes a `::prev`-suffixed selector.
#[test]
fn traversal_prev_suffix_in_stale_error() {
let e = crate::error::BrowserError::StaleNode {
selector: "td.label::prev".to_string(),
};
assert!(e.to_string().contains("td.label::prev"));
}
}