// car-browser 0.12.0
//
// Browser automation and perception pipeline for Common Agent Runtime
// Documentation
//! Browser backend trait — the abstract interface that all browser implementations must satisfy.
//!
//! This maps 1:1 to human-equivalent perception and input (Manifesto Principles 3-5).

use async_trait::async_trait;
use thiserror::Error;

use crate::models::{A11yNode, CookieParam, Modifier, Viewport, WaitCondition};

/// Errors that can occur in browser operations.
///
/// Every variant except `Timeout` wraps a `String` that is appended to the
/// variant's fixed message prefix when the error is displayed.
#[derive(Error, Debug)]
pub enum BrowserError {
    /// A screenshot could not be captured; the payload is the failure detail.
    #[error("Screenshot capture failed: {0}")]
    ScreenshotFailed(String),

    /// The accessibility tree could not be extracted; the payload is the failure detail.
    #[error("Accessibility tree extraction failed: {0}")]
    AccessibilityFailed(String),

    /// A navigation did not succeed; the payload is the failure detail.
    #[error("Navigation failed: {0}")]
    NavigationFailed(String),

    /// Injecting input failed; the payload is the failure detail.
    // NOTE(review): presumably shared by click/text/keypress/scroll injection — confirm
    // against the backend implementations.
    #[error("Input injection failed: {0}")]
    InputFailed(String),

    /// A requested element could not be found; the payload is appended to the message.
    // NOTE(review): the payload is presumably the node ID or description that was
    // looked up — confirm against the backend implementations.
    #[error("Element not found: {0}")]
    ElementNotFound(String),

    /// An internal platform error occurred; the payload is the failure detail.
    #[error("Platform internal error: {0}")]
    PlatformInternal(String),

    /// A wait condition timed out. This is the only variant without a payload.
    #[error("Wait condition timed out")]
    Timeout,

    /// The browser is not available; the payload is appended to the message.
    #[error("Browser not available: {0}")]
    NotAvailable(String),

    /// The requested operation is not supported; the payload names what is missing.
    ///
    /// The default bodies of `BrowserBackend::set_cookies`, `set_local_storage`
    /// and `set_extra_headers` return this variant.
    #[error("Not supported: {0}")]
    Unsupported(String),
}

/// The contract every browser implementation has to fulfil.
///
/// A backend drives a real browser (Tauri WebView, headless Chromium, etc.) and is
/// limited to human-equivalent perception and input.
///
/// # Perception (what the AI can see)
/// - Screenshots: the rendered pixels
/// - Accessibility tree: the semantic structure exposed to assistive technologies
///
/// # Actions (what the AI can do)
/// - Click, type, scroll, keypress — each maps 1:1 to human input
/// - Navigation — the equivalent of typing a URL
///
/// # Disallowed
/// - DOM traversal, JS execution for data extraction, hidden attributes
/// - Network traffic inspection, cookie/storage introspection
///
/// # Implementing
/// Every method is required except the auth-state injection methods
/// (`set_cookies`, `set_local_storage`, `set_extra_headers`), whose default bodies
/// return [`BrowserError::Unsupported`], and `shutdown`, whose default is a no-op.
#[async_trait]
pub trait BrowserBackend: Send + Sync {
    // -------------------------------------------------------------------------
    // Perception
    // -------------------------------------------------------------------------

    /// Takes a screenshot of the current page and returns it as PNG bytes.
    async fn capture_screenshot(&self) -> Result<Vec<u8>, BrowserError>;

    /// Returns the accessibility tree of the current page as a list of nodes.
    async fn get_accessibility_tree(&self) -> Result<Vec<A11yNode>, BrowserError>;

    /// Returns the current viewport dimensions.
    ///
    /// Unlike most of this trait, this is a synchronous method.
    fn get_viewport(&self) -> Result<Viewport, BrowserError>;

    /// Returns the URL of the current page.
    ///
    /// Unlike most of this trait, this is a synchronous method.
    fn get_current_url(&self) -> Result<String, BrowserError>;

    /// Returns the title of the current page.
    async fn get_page_title(&self) -> Result<String, BrowserError>;

    // -------------------------------------------------------------------------
    // Navigation
    // -------------------------------------------------------------------------

    /// Navigates the browser to `url`.
    async fn navigate(&self, url: &str) -> Result<(), BrowserError>;

    // -------------------------------------------------------------------------
    // Human-equivalent input (Manifesto Principle 5)
    // -------------------------------------------------------------------------

    /// Clicks at the viewport coordinates (`x`, `y`).
    async fn inject_click(&self, x: f64, y: f64) -> Result<(), BrowserError>;

    /// Types `text` into whichever element currently has focus.
    async fn inject_text(&self, text: &str) -> Result<(), BrowserError>;

    /// Presses `key`, optionally combined with `modifiers`.
    async fn inject_keypress(&self, key: &str, modifiers: &[Modifier]) -> Result<(), BrowserError>;

    /// Scrolls the page by `delta_y`.
    // NOTE(review): the unit and sign convention of `delta_y` are not specified
    // here — confirm against the backend implementations.
    async fn inject_scroll(&self, delta_y: i32) -> Result<(), BrowserError>;

    // -------------------------------------------------------------------------
    // Accessibility actions (VoiceOver-equivalent)
    // -------------------------------------------------------------------------

    /// Clicks the element identified by accessibility node ID `node_id` (AXPress).
    async fn click_element(&self, node_id: &str) -> Result<(), BrowserError>;

    /// Types `text` into the element identified by accessibility node ID `node_id`.
    async fn type_into_element(&self, node_id: &str, text: &str) -> Result<(), BrowserError>;

    /// Gives focus to the element identified by accessibility node ID `node_id`.
    async fn focus_element(&self, node_id: &str) -> Result<(), BrowserError>;

    // -------------------------------------------------------------------------
    // Wait conditions
    // -------------------------------------------------------------------------

    /// Reports whether the page is fully loaded.
    async fn is_page_loaded(&self) -> Result<bool, BrowserError>;

    /// Waits for `condition` to be met, bounded by `timeout_ms`.
    // NOTE(review): the trait does not pin down how an expired timeout is
    // reported (an `Ok(false)` result versus `BrowserError::Timeout`) — confirm
    // against the backend implementations.
    async fn wait_until(
        &self,
        condition: &WaitCondition,
        timeout_ms: u64,
    ) -> Result<bool, BrowserError>;

    /// Reports whether the accessibility tree contains an element matching the
    /// description given by `name_contains` and, optionally, `role`.
    async fn element_exists_a11y(
        &self,
        name_contains: &str,
        role: Option<&str>,
    ) -> Result<bool, BrowserError>;

    // -------------------------------------------------------------------------
    // Auth state injection (pre-navigation)
    // -------------------------------------------------------------------------

    /// Injects cookies into the browser.
    ///
    /// Call this before navigating so the cookies are sent with the first request.
    ///
    /// The default implementation returns [`BrowserError::Unsupported`].
    async fn set_cookies(&self, _cookies: &[CookieParam]) -> Result<(), BrowserError> {
        let reason = String::from("set_cookies not implemented");
        Err(BrowserError::Unsupported(reason))
    }

    /// Sets localStorage items for the given origin.
    ///
    /// The browser will briefly navigate to the origin in order to set the items.
    ///
    /// The default implementation returns [`BrowserError::Unsupported`].
    async fn set_local_storage(
        &self,
        _origin: &str,
        _items: &[(String, String)],
    ) -> Result<(), BrowserError> {
        let reason = String::from("set_local_storage not implemented");
        Err(BrowserError::Unsupported(reason))
    }

    /// Sets extra HTTP headers to be included on every request.
    ///
    /// The default implementation returns [`BrowserError::Unsupported`].
    async fn set_extra_headers(&self, _headers: &[(String, String)]) -> Result<(), BrowserError> {
        let reason = String::from("set_extra_headers not implemented");
        Err(BrowserError::Unsupported(reason))
    }

    // -------------------------------------------------------------------------
    // Lifecycle
    // -------------------------------------------------------------------------

    /// Shuts the backend down and releases its resources.
    ///
    /// - Headless Chromium: terminates the browser process.
    /// - Tauri: WebView cleanup.
    ///
    /// The default implementation does nothing and returns `Ok(())`.
    async fn shutdown(&self) -> Result<(), BrowserError> {
        Ok(())
    }
}