use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;
use thiserror::Error;
/// Error type returned by the web adapter API declared in this file.
///
/// The `Display` text of each variant is generated by the `thiserror` derive
/// from the `#[error(...)]` attribute attached to that variant.
#[derive(Error, Debug)]
pub enum WebAdapterError {
/// Navigation did not succeed; `message` is interpolated into the error text.
#[error("Navigation failed: {message}")]
NavigationFailed { message: String },
/// Content extraction did not succeed; `message` is interpolated into the error text.
#[error("Content extraction failed: {message}")]
ExtractionFailed { message: String },
/// A capture in `format` failed. The format is printed using its `Debug`
/// form (e.g. `Png`), not the lowercase `Display` form.
#[error("Capture failed: {format:?} - {message}")]
CaptureFailed {
format: CaptureFormat,
message: String,
},
/// Navigation timed out; the payload is printed as a millisecond count.
#[error("Navigation timeout after {0}ms")]
NavigationTimeout(u64),
/// The payload is reported as an invalid URL.
#[error("Invalid URL: {0}")]
InvalidUrl(String),
/// The adapter is not connected to the web service.
#[error("Adapter not connected to web service")]
NotConnected,
/// The connection to the web service was lost.
#[error("Connection to web service lost")]
ConnectionLost,
/// The payload is reported as an invalid CSS selector.
#[error("Invalid CSS selector: {0}")]
InvalidSelector(String),
/// JavaScript execution failed; `message` is interpolated into the error text.
#[error("JavaScript execution failed: {message}")]
JavaScriptError { message: String },
/// The requested capture format is not supported (printed using `Debug`).
#[error("Capture format not supported: {0:?}")]
UnsupportedFormat(CaptureFormat),
/// The payload names a resource that was not found.
#[error("Resource not found: {0}")]
NotFound(String),
/// Network-level failure; the payload is the detail text.
#[error("Network error: {0}")]
Network(String),
/// Serialization failure; also used by `CapturedPage::as_string` when the
/// captured bytes are not valid UTF-8.
#[error("Serialization error: {0}")]
Serialization(String),
/// Catch-all variant whose message is the payload verbatim.
#[error("{0}")]
Generic(String),
}
/// Convenience alias for a `Result` whose error type is [`WebAdapterError`].
pub type WebAdapterResult<T> = std::result::Result<T, WebAdapterError>;
/// Descriptor for a page, returned by the navigation methods of
/// `WebBrowserAdapter` and passed back to its page-level operations.
///
/// The derived `Default` yields empty strings and `is_active == false`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PageHandle {
/// Page identifier; printed first by the `Display` implementation.
pub id: String,
/// Page URL; printed second by the `Display` implementation.
pub url: String,
/// Page title (not part of the `Display` output).
pub title: String,
/// Active flag.
/// NOTE(review): the exact meaning is decided by adapter implementations,
/// which are not visible in this file.
pub is_active: bool,
}
/// Renders a handle as `Page(<id>: <url>)`.
///
/// The title and the active flag are deliberately left out of the output.
impl fmt::Display for PageHandle {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pull out only the two fields that appear in the rendered text.
        let Self { id, url, .. } = self;
        write!(f, "Page({id}: {url})")
    }
}
/// Options passed to `WebBrowserAdapter::navigate`.
///
/// How each option is honoured is up to the adapter implementation, which is
/// not visible in this file; see the `Default` impl for the default values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NavigateOptions {
/// Navigation timeout in milliseconds (default `30000`).
pub timeout_ms: u64,
/// Page event to wait for before navigation counts as finished (default `Load`).
pub wait_until: NavigateWaitEvent,
/// Optional JavaScript source (default `None`).
/// NOTE(review): presumably injected into the page; when it runs is adapter-defined.
pub inject_js: Option<String>,
/// Header name -> value map (default empty).
/// NOTE(review): presumably extra HTTP request headers — confirm with implementations.
pub headers: HashMap<String, String>,
/// Optional user-agent override (default `None`).
pub user_agent: Option<String>,
/// Optional viewport as a pair of `u32`s (default `None`).
/// NOTE(review): presumably `(width, height)` in pixels — confirm.
pub viewport: Option<(u32, u32)>,
/// Whether redirects should be followed (default `true`).
pub follow_redirects: bool,
}
impl Default for NavigateOptions {
    /// Builds the default navigation options: a 30 000 ms timeout, waiting for
    /// the `Load` event, following redirects, and every optional setting unset.
    fn default() -> Self {
        // The settings that carry a real default value are named up front;
        // everything optional is left empty below.
        let timeout_ms = 30_000;
        let wait_until = NavigateWaitEvent::Load;
        let follow_redirects = true;
        let headers = HashMap::new();

        Self {
            timeout_ms,
            wait_until,
            inject_js: None,
            headers,
            user_agent: None,
            viewport: None,
            follow_redirects,
        }
    }
}
/// Page event that a navigation waits for.
///
/// With `rename_all = "lowercase"` the serde names are the lowercased variant
/// names, which match the strings produced by the `Display` implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum NavigateWaitEvent {
/// Displayed as `load`; this is the `Default` variant.
#[default]
Load,
/// Displayed as `domcontentloaded`.
DomContentLoaded,
/// Displayed as `networkidle`.
NetworkIdle,
}
/// Writes the lowercase event name (`load`, `domcontentloaded`, `networkidle`).
impl fmt::Display for NavigateWaitEvent {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the label first, then emit it in a single write.
        let label = match self {
            Self::Load => "load",
            Self::DomContentLoaded => "domcontentloaded",
            Self::NetworkIdle => "networkidle",
        };
        f.write_str(label)
    }
}
/// Result of `WebBrowserAdapter::extract_content`.
///
/// The derived `Default` yields empty strings/vectors, `None` options and a
/// `confidence` of `0.0`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ExtractedContent {
/// Extracted text.
pub text: String,
/// Optional HTML accompanying the text.
pub html: Option<String>,
/// Links found in the content.
pub links: Vec<ExtractedLink>,
/// Images found in the content.
pub images: Vec<ExtractedImage>,
/// Metadata describing the content.
pub metadata: ContentMetadata,
/// Optional structured data as an arbitrary JSON value.
/// NOTE(review): the schema is not defined in this file.
pub structured_data: Option<serde_json::Value>,
/// Optional language of the content.
/// NOTE(review): the code format (e.g. ISO 639) is not specified here.
pub language: Option<String>,
/// Confidence score.
/// NOTE(review): range and meaning are not specified in this file — confirm with producers.
pub confidence: f32,
}
/// A link collected during content extraction (element of `ExtractedContent::links`).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ExtractedLink {
/// Link text.
pub text: String,
/// Link target.
/// NOTE(review): whether this is absolute or as written in the page is not specified here.
pub href: String,
/// Optional link title.
pub title: Option<String>,
}
/// An image collected during content extraction (element of `ExtractedContent::images`).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ExtractedImage {
/// Image source.
pub src: String,
/// Optional alternative text.
pub alt: Option<String>,
/// Optional image title.
pub title: Option<String>,
}
/// Metadata attached to `ExtractedContent`.
///
/// Every named field is optional; the derived `Default` sets them all to
/// `None` and leaves `custom_meta` empty.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ContentMetadata {
/// Title, if known.
pub title: Option<String>,
/// Description, if known.
pub description: Option<String>,
/// NOTE(review): presumably the Open Graph `og:image` value — confirm with producers.
pub og_image: Option<String>,
/// NOTE(review): presumably the Open Graph `og:title` value — confirm with producers.
pub og_title: Option<String>,
/// Content type, if known.
pub content_type: Option<String>,
/// Character set, if known.
pub charset: Option<String>,
/// Author, if known.
pub author: Option<String>,
/// Publish date as an unparsed string; no date format is enforced here.
pub publish_date: Option<String>,
/// Additional name -> value metadata entries.
pub custom_meta: HashMap<String, String>,
}
/// Options passed to `WebBrowserAdapter::extract_content`.
///
/// How each option is honoured is up to the adapter implementation, which is
/// not visible in this file; see the `Default` impl for the default values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractOptions {
/// Optional selector for the content to extract (default `None`).
/// NOTE(review): presumably a CSS selector — confirm with implementations.
pub content_selector: Option<String>,
/// Whether to collect links (default `true`).
pub extract_links: bool,
/// Whether to collect images (default `false`).
pub extract_images: bool,
/// Whether to collect structured data (default `false`).
pub extract_structured_data: bool,
/// Whether scripts should be removed (default `true`).
pub remove_scripts: bool,
/// Minimum text length (default `20`).
/// NOTE(review): the unit (bytes vs. characters) and what it applies to are not specified here.
pub min_text_length: usize,
/// Whether to detect the content language (default `false`).
pub detect_language: bool,
/// Optional custom JavaScript source (default `None`).
pub custom_js: Option<String>,
}
impl Default for ExtractOptions {
    /// Builds the default extraction options: link collection and script
    /// removal enabled, every other feature disabled, and a minimum text
    /// length of 20.
    fn default() -> Self {
        const MIN_TEXT_LENGTH: usize = 20;

        Self {
            // Enabled out of the box.
            extract_links: true,
            remove_scripts: true,
            // Opt-in features.
            extract_images: false,
            extract_structured_data: false,
            detect_language: false,
            // Tunables and optional inputs.
            min_text_length: MIN_TEXT_LENGTH,
            content_selector: None,
            custom_js: None,
        }
    }
}
/// Output format of a page capture.
///
/// With `rename_all = "lowercase"` the serde names are the lowercased variant
/// names, which match the strings produced by the `Display` implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, Hash)]
#[serde(rename_all = "lowercase")]
pub enum CaptureFormat {
/// Displayed as `png`; this is the `Default` variant.
#[default]
Png,
/// Displayed as `jpeg`.
Jpeg,
/// Displayed as `pdf`.
Pdf,
/// Displayed as `mhtml`.
Mhtml,
/// Displayed as `html`.
Html,
/// Displayed as `webp`.
Webp,
}
/// Writes the lowercase format name (`png`, `jpeg`, `pdf`, `mhtml`, `html`, `webp`).
impl fmt::Display for CaptureFormat {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the label first, then emit it in a single write.
        let label = match self {
            Self::Png => "png",
            Self::Jpeg => "jpeg",
            Self::Pdf => "pdf",
            Self::Mhtml => "mhtml",
            Self::Html => "html",
            Self::Webp => "webp",
        };
        f.write_str(label)
    }
}
/// A captured page, as returned by `WebBrowserAdapter::capture_screenshot`.
///
/// The derived `Default` yields `CaptureFormat::Png`, empty data, an empty
/// MIME type and a size of `0`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CapturedPage {
/// Format of the captured data.
pub format: CaptureFormat,
/// Raw captured bytes; `as_string` interprets them as UTF-8.
pub data: Vec<u8>,
/// MIME type of `data`.
pub mime_type: String,
/// Size in bytes.
/// NOTE(review): presumably equal to `data.len()`, but nothing in this file enforces that.
pub size_bytes: usize,
/// Information about how the capture was taken.
pub metadata: CaptureMetadata,
}
impl CapturedPage {
    /// Returns the captured bytes decoded as a UTF-8 `String`.
    ///
    /// The page itself is left untouched; the returned string is a copy.
    ///
    /// # Errors
    ///
    /// Returns [`WebAdapterError::Serialization`] carrying the UTF-8 decoding
    /// error text when `data` is not valid UTF-8.
    pub fn as_string(&self) -> WebAdapterResult<String> {
        // Validate the borrowed bytes first, then copy them into an owned string.
        match std::str::from_utf8(&self.data) {
            Ok(text) => Ok(text.to_owned()),
            Err(utf8_err) => Err(WebAdapterError::Serialization(utf8_err.to_string())),
        }
    }
}
/// Information recorded alongside a `CapturedPage`.
///
/// The derived `Default` yields an empty URL, no title, zero dimensions,
/// `full_page == false` and a scale factor of `0.0`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CaptureMetadata {
/// URL of the captured page.
pub url: String,
/// Title of the captured page, if known.
pub title: Option<String>,
/// Viewport width.
/// NOTE(review): presumably in CSS pixels — confirm with implementations.
pub viewport_width: u32,
/// Viewport height (same unit caveat as `viewport_width`).
pub viewport_height: u32,
/// Whether the capture covered the full page.
pub full_page: bool,
/// Device scale factor used for the capture.
pub device_scale_factor: f32,
}
/// Options passed to `WebBrowserAdapter::capture_screenshot`.
///
/// How each option is honoured is up to the adapter implementation, which is
/// not visible in this file; see the `Default` impl for the default values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CaptureOptions {
/// Requested output format (default `Png`).
pub format: CaptureFormat,
/// Whether to capture the full page (default `true`).
pub full_page: bool,
/// Capture timeout in milliseconds (default `10000`).
pub timeout_ms: u64,
/// Optional quality setting (default `Some(80)`); the `quality` builder
/// method caps the value at 100.
/// NOTE(review): which formats use this value is adapter-defined.
pub quality: Option<u8>,
/// Whether the background should be omitted (default `false`).
pub omit_background: bool,
/// Optional device scale factor override (default `None`).
pub device_scale_factor: Option<f32>,
/// Delay in milliseconds (default `0`).
/// NOTE(review): presumably applied before the capture is taken — confirm.
pub delay_ms: u64,
/// Optional JavaScript source (default `None`).
/// NOTE(review): presumably executed before the capture — confirm.
pub execute_js: Option<String>,
}
impl Default for CaptureOptions {
    /// Builds the default capture options: a full-page PNG with quality 80, a
    /// 10 000 ms timeout, no delay, and every optional setting unset.
    fn default() -> Self {
        const TIMEOUT_MS: u64 = 10_000;
        const QUALITY: u8 = 80;

        Self {
            // What to capture.
            format: CaptureFormat::Png,
            full_page: true,
            omit_background: false,
            quality: Some(QUALITY),
            device_scale_factor: None,
            // Timing.
            timeout_ms: TIMEOUT_MS,
            delay_ms: 0,
            // No pre-capture script by default.
            execute_js: None,
        }
    }
}
/// Chainable builder-style setters.
///
/// Every method consumes `self` and returns the updated options, so the
/// returned value must be used: a bare `opts.quality(90);` would throw the
/// change away. `#[must_use]` makes the compiler warn about that mistake.
impl CaptureOptions {
    /// Returns the options with the capture format replaced by `format`.
    #[must_use]
    pub fn format(self, format: CaptureFormat) -> Self {
        Self { format, ..self }
    }

    /// Returns the options with the full-page flag set to `full`.
    #[must_use]
    pub fn full_page(self, full: bool) -> Self {
        Self {
            full_page: full,
            ..self
        }
    }

    /// Returns the options with the quality set to `Some(quality)`.
    ///
    /// Values above 100 are capped at 100.
    #[must_use]
    pub fn quality(self, quality: u8) -> Self {
        Self {
            quality: Some(quality.min(100)),
            ..self
        }
    }

    /// Returns the options with the timeout set to `timeout` milliseconds.
    #[must_use]
    pub fn timeout_ms(self, timeout: u64) -> Self {
        Self {
            timeout_ms: timeout,
            ..self
        }
    }
}
/// Interface for a web browser backend.
///
/// Only the signatures are declared here; no implementation is visible in this
/// file, so the notes below restate what the signatures show and flag the rest
/// as assumptions. Implementors must be `Send + Sync`, and the async methods
/// are expanded by the `async_trait` macro.
#[async_trait]
pub trait WebBrowserAdapter: Send + Sync {
/// Connects the adapter.
/// NOTE(review): presumably to the web service named in the
/// `WebAdapterError::NotConnected` message — confirm with implementations.
async fn connect(&mut self) -> WebAdapterResult<()>;
/// Disconnects the adapter.
async fn disconnect(&mut self) -> WebAdapterResult<()>;
/// Reports whether the adapter is currently connected.
fn is_connected(&self) -> bool;
/// Navigates to `url` using `options` and returns a handle for the resulting page.
async fn navigate(
&mut self,
url: &str,
options: NavigateOptions,
) -> WebAdapterResult<PageHandle>;
/// Navigates back and returns a page handle.
/// NOTE(review): presumably one step in the browsing history — confirm.
async fn go_back(&mut self) -> WebAdapterResult<PageHandle>;
/// Navigates forward and returns a page handle.
/// NOTE(review): presumably one step in the browsing history — confirm.
async fn go_forward(&mut self) -> WebAdapterResult<PageHandle>;
/// Reloads and returns a page handle.
async fn reload(&mut self) -> WebAdapterResult<PageHandle>;
/// Extracts content from `page` according to `options`.
async fn extract_content(
&mut self,
page: &PageHandle,
options: ExtractOptions,
) -> WebAdapterResult<ExtractedContent>;
/// Runs `script` against `page` and returns the outcome as a JSON value.
async fn execute_js(
&mut self,
page: &PageHandle,
script: &str,
) -> WebAdapterResult<serde_json::Value>;
/// Returns text from `page` for `selector`.
/// NOTE(review): presumably a CSS selector; behaviour when several or no
/// elements match is not specified here.
async fn get_text(&mut self, page: &PageHandle, selector: &str) -> WebAdapterResult<String>;
/// Captures `page` according to `options`.
async fn capture_screenshot(
&mut self,
page: &PageHandle,
options: CaptureOptions,
) -> WebAdapterResult<CapturedPage>;
/// Returns adapter diagnostics as a JSON value; the schema is implementation-defined.
fn diagnostics(&self) -> serde_json::Value;
/// Returns the adapter's name.
fn name(&self) -> &str;
/// Returns the adapter's version string.
fn version(&self) -> &str;
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Default navigation options: 30 s timeout, wait for `Load`, follow redirects.
    #[test]
    fn test_navigate_options_default() {
        let NavigateOptions {
            timeout_ms,
            wait_until,
            follow_redirects,
            ..
        } = NavigateOptions::default();

        assert_eq!(timeout_ms, 30000);
        assert_eq!(wait_until, NavigateWaitEvent::Load);
        assert!(follow_redirects);
    }

    /// Chained builder calls each update their own field.
    #[test]
    fn test_capture_options_builder() {
        let built = CaptureOptions::default()
            .format(CaptureFormat::Jpeg)
            .quality(90)
            .full_page(false);

        assert_eq!(built.format, CaptureFormat::Jpeg);
        assert_eq!(built.quality, Some(90));
        assert!(!built.full_page);
    }

    /// Capture formats display as their lowercase names.
    #[test]
    fn test_capture_format_display() {
        let cases = [
            (CaptureFormat::Png, "png"),
            (CaptureFormat::Jpeg, "jpeg"),
            (CaptureFormat::Pdf, "pdf"),
        ];
        for (format, expected) in cases {
            assert_eq!(format.to_string(), expected);
        }
    }

    /// A page handle displays as `Page(<id>: <url>)`.
    #[test]
    fn test_page_handle_display() {
        let handle = PageHandle {
            id: String::from("page-1"),
            url: String::from("https://example.com"),
            title: String::from("Example"),
            is_active: true,
        };

        assert_eq!(handle.to_string(), "Page(page-1: https://example.com)");
    }

    /// Default extraction options: links and script removal on, the rest off.
    #[test]
    fn test_extract_options_default() {
        let ExtractOptions {
            extract_links,
            extract_images,
            extract_structured_data,
            remove_scripts,
            ..
        } = ExtractOptions::default();

        assert!(extract_links);
        assert!(!extract_images);
        assert!(!extract_structured_data);
        assert!(remove_scripts);
    }

    /// Default content metadata carries no title and no custom entries.
    #[test]
    fn test_content_metadata_default() {
        let defaults = ContentMetadata::default();

        assert!(defaults.title.is_none());
        assert!(defaults.custom_meta.is_empty());
    }

    /// Wait events display as their lowercase names.
    #[test]
    fn test_navigate_wait_event_display() {
        let cases = [
            (NavigateWaitEvent::Load, "load"),
            (NavigateWaitEvent::DomContentLoaded, "domcontentloaded"),
            (NavigateWaitEvent::NetworkIdle, "networkidle"),
        ];
        for (event, expected) in cases {
            assert_eq!(event.to_string(), expected);
        }
    }

    /// Quality values above 100 are capped at 100.
    #[test]
    fn test_quality_clamping() {
        let capped = CaptureOptions::default().quality(150);

        assert_eq!(capped.quality, Some(100));
    }

    /// UTF-8 capture data converts back to the original text.
    #[test]
    fn test_capture_page_as_string() {
        let metadata = CaptureMetadata {
            url: String::from("https://example.com"),
            title: None,
            viewport_width: 1024,
            viewport_height: 768,
            full_page: false,
            device_scale_factor: 1.0,
        };
        let captured = CapturedPage {
            format: CaptureFormat::Html,
            data: b"<html>test</html>".to_vec(),
            mime_type: String::from("text/html"),
            size_bytes: 17,
            metadata,
        };

        let rendered = captured.as_string();
        assert!(rendered.is_ok());
        assert_eq!(rendered.unwrap(), "<html>test</html>");
    }
}