car-browser 0.12.0

Browser automation and perception pipeline for Common Agent Runtime
Documentation
//! Browser tool registration and execution for car-engine.
//!
//! Registers `browse_*` tools with the CAR runtime and dispatches tool calls
//! to the appropriate `BrowserBackend` methods.

use std::sync::Arc;

use async_trait::async_trait;
use car_engine::ToolExecutor;
use car_ir::ToolSchema;
use serde_json::{json, Value};
use tokio::sync::RwLock;

use crate::backend::BrowserBackend;
use crate::models::{Modifier, WaitCondition};
use crate::perception::pipeline::PerceptionPipeline;
use crate::perception::ui_map::UiMap;

/// Tool executor that dispatches `browse_*` tool calls to a `BrowserBackend`.
///
/// Maintains a reference to the last UiMap from `browse_observe` to resolve
/// `el_N` IDs to AX node IDs before passing to the backend.
pub struct BrowserToolExecutor {
    /// Backend that carries out the browser actions (navigate, click, type,
    /// scroll, keypress, wait) and supplies the raw page state for observation.
    backend: Arc<dyn BrowserBackend>,
    /// Perception pipeline that turns a screenshot plus accessibility nodes
    /// into a `UiMap` during `browse_observe`.
    pipeline: Arc<dyn PerceptionPipeline>,
    /// Last observed UiMap — used to resolve el_N → ax_ref for click/type.
    /// Starts as `None` and is replaced on every successful `browse_observe`.
    last_ui_map: Arc<RwLock<Option<UiMap>>>,
}

impl BrowserToolExecutor {
    /// Build an executor around the given backend and perception pipeline.
    ///
    /// The cached `UiMap` starts out empty (`None`).
    pub fn new(backend: Arc<dyn BrowserBackend>, pipeline: Arc<dyn PerceptionPipeline>) -> Self {
        let last_ui_map = Arc::new(RwLock::new(None));
        Self {
            backend,
            pipeline,
            last_ui_map,
        }
    }

    /// Resolve an element ID (el_N) to the backend's AX node ID.
    ///
    /// Falls back to the raw ID if no UiMap is available or element not found.
    async fn resolve_element_id(&self, element_id: &str) -> String {
        let guard = self.last_ui_map.read().await;
        if let Some(ui_map) = guard.as_ref() {
            if let Some(element) = ui_map.get_element(element_id) {
                if let Some(ref ax_ref) = element.ax_ref {
                    return ax_ref.clone();
                }
            }
        }
        // Fallback: pass through as-is (may be a raw AX node ID already)
        element_id.to_string()
    }

    /// Get the tool schemas for all browser tools.
    pub fn tool_schemas() -> Vec<ToolSchema> {
        vec![
            ToolSchema {
                name: "browse_navigate".to_string(),
                description: "Navigate the browser to a URL".to_string(),
                parameters: json!({
                    "type": "object",
                    "properties": {
                        "url": { "type": "string", "description": "URL to navigate to" }
                    },
                    "required": ["url"]
                }),
                returns: Some(json!({"type": "object", "properties": {"url": {"type": "string"}}})),
                idempotent: false,
                cache_ttl_secs: None,
                rate_limit: None,
            },
            ToolSchema {
                name: "browse_click".to_string(),
                description: "Click on a UI element by accessibility node ID".to_string(),
                parameters: json!({
                    "type": "object",
                    "properties": {
                        "element_id": { "type": "string", "description": "Accessibility node ID (e.g. 'el_5')" }
                    },
                    "required": ["element_id"]
                }),
                returns: Some(json!({"type": "object"})),
                idempotent: false,
                cache_ttl_secs: None,
                rate_limit: None,
            },
            ToolSchema {
                name: "browse_type".to_string(),
                description: "Type text into a UI element by accessibility node ID".to_string(),
                parameters: json!({
                    "type": "object",
                    "properties": {
                        "element_id": { "type": "string", "description": "Accessibility node ID of a text field" },
                        "text": { "type": "string", "description": "Text to enter" }
                    },
                    "required": ["element_id", "text"]
                }),
                returns: Some(json!({"type": "object"})),
                idempotent: false,
                cache_ttl_secs: None,
                rate_limit: None,
            },
            ToolSchema {
                name: "browse_scroll".to_string(),
                description: "Scroll the browser page".to_string(),
                parameters: json!({
                    "type": "object",
                    "properties": {
                        "delta_y": { "type": "integer", "description": "Scroll amount (positive = down, negative = up)" }
                    },
                    "required": ["delta_y"]
                }),
                returns: Some(json!({"type": "object"})),
                idempotent: false,
                cache_ttl_secs: None,
                rate_limit: None,
            },
            ToolSchema {
                name: "browse_keypress".to_string(),
                description: "Press a key with optional modifiers".to_string(),
                parameters: json!({
                    "type": "object",
                    "properties": {
                        "key": { "type": "string", "description": "Key to press (e.g. 'Enter', 'a', 'Tab')" },
                        "modifiers": {
                            "type": "array",
                            "items": { "type": "string", "enum": ["shift", "control", "alt", "meta"] },
                            "description": "Optional modifier keys to hold during keypress"
                        }
                    },
                    "required": ["key"]
                }),
                returns: Some(json!({"type": "object", "properties": {"key": {"type": "string"}, "status": {"type": "string"}}})),
                idempotent: false,
                cache_ttl_secs: None,
                rate_limit: None,
            },
            ToolSchema {
                name: "browse_wait".to_string(),
                description: "Wait for a browser condition to be met".to_string(),
                parameters: json!({
                    "type": "object",
                    "properties": {
                        "condition": { "type": "string", "description": "Condition to wait for: 'page_loaded' or 'url_changed'" },
                        "timeout_ms": { "type": "number", "description": "Timeout in milliseconds (default: 5000)" }
                    },
                    "required": ["condition"]
                }),
                returns: Some(json!({"type": "object", "properties": {"condition": {"type": "string"}, "met": {"type": "boolean"}}})),
                idempotent: true,
                cache_ttl_secs: None,
                rate_limit: None,
            },
            ToolSchema {
                name: "browse_observe".to_string(),
                description: "Observe the current browser state: take screenshot, extract accessibility tree, produce UiMap".to_string(),
                parameters: json!({
                    "type": "object",
                    "properties": {
                        "include_screenshot": { "type": "boolean", "description": "Include base64 screenshot inline (default: false, returns file path instead)" }
                    }
                }),
                returns: Some(json!({
                    "type": "object",
                    "properties": {
                        "url": {"type": "string"},
                        "title": {"type": "string"},
                        "ui_map": {"type": "string"},
                        "screenshot_path": {"type": "string"},
                        "screenshot_base64": {"type": "string", "description": "Only present if include_screenshot=true"}
                    }
                })),
                idempotent: true,
                cache_ttl_secs: None,
                rate_limit: None,
            },
        ]
    }

    async fn handle_navigate(&self, params: &Value) -> Result<Value, String> {
        let url = params
            .get("url")
            .and_then(|v| v.as_str())
            .ok_or("Missing required parameter: url")?;
        self.backend
            .navigate(url)
            .await
            .map_err(|e| e.to_string())?;
        Ok(json!({"url": url, "status": "navigated"}))
    }

    async fn handle_click(&self, params: &Value) -> Result<Value, String> {
        let element_id = params
            .get("element_id")
            .and_then(|v| v.as_str())
            .ok_or("Missing required parameter: element_id")?;
        let resolved_id = self.resolve_element_id(element_id).await;
        self.backend
            .click_element(&resolved_id)
            .await
            .map_err(|e| e.to_string())?;
        Ok(json!({"element_id": element_id, "resolved_id": resolved_id, "status": "clicked"}))
    }

    async fn handle_type(&self, params: &Value) -> Result<Value, String> {
        let element_id = params
            .get("element_id")
            .and_then(|v| v.as_str())
            .ok_or("Missing required parameter: element_id")?;
        let text = params
            .get("text")
            .and_then(|v| v.as_str())
            .ok_or("Missing required parameter: text")?;
        let resolved_id = self.resolve_element_id(element_id).await;
        self.backend
            .type_into_element(&resolved_id, text)
            .await
            .map_err(|e| e.to_string())?;
        Ok(
            json!({"element_id": element_id, "resolved_id": resolved_id, "text": text, "status": "typed"}),
        )
    }

    async fn handle_scroll(&self, params: &Value) -> Result<Value, String> {
        let delta_y = params
            .get("delta_y")
            .and_then(|v| v.as_i64())
            .ok_or("Missing required parameter: delta_y")? as i32;
        self.backend
            .inject_scroll(delta_y)
            .await
            .map_err(|e| e.to_string())?;
        Ok(json!({"delta_y": delta_y, "status": "scrolled"}))
    }

    async fn handle_keypress(&self, params: &Value) -> Result<Value, String> {
        let key = params
            .get("key")
            .and_then(|v| v.as_str())
            .ok_or("Missing required parameter: key")?;
        let modifiers: Vec<Modifier> = params
            .get("modifiers")
            .and_then(|v| v.as_array())
            .map(|arr| {
                arr.iter()
                    .filter_map(|m| match m.as_str()? {
                        "shift" => Some(Modifier::Shift),
                        "control" => Some(Modifier::Control),
                        "alt" => Some(Modifier::Alt),
                        "meta" => Some(Modifier::Meta),
                        _ => None,
                    })
                    .collect()
            })
            .unwrap_or_default();
        self.backend
            .inject_keypress(key, &modifiers)
            .await
            .map_err(|e| e.to_string())?;
        Ok(json!({"key": key, "status": "pressed"}))
    }

    async fn handle_wait(&self, params: &Value) -> Result<Value, String> {
        let condition_str = params
            .get("condition")
            .and_then(|v| v.as_str())
            .ok_or("Missing required parameter: condition")?;
        let timeout_ms = params
            .get("timeout_ms")
            .and_then(|v| v.as_u64())
            .unwrap_or(5000);
        // Accepted shapes:
        //   "page_loaded" | "url_changed"
        //   "a11y_contains_text:<text>"
        //   "element_with_name:<name>"
        //   "element_with_name:<name>@<role>"
        let condition = match condition_str {
            "page_loaded" => WaitCondition::PageLoaded,
            "url_changed" => WaitCondition::UrlChanged,
            s if s.starts_with("a11y_contains_text:") => WaitCondition::A11yContainsText {
                text: s["a11y_contains_text:".len()..].to_string(),
            },
            s if s.starts_with("element_with_name:") => {
                let rest = &s["element_with_name:".len()..];
                let (name_contains, role) = match rest.split_once('@') {
                    Some((n, r)) => (n.to_string(), Some(r.to_string())),
                    None => (rest.to_string(), None),
                };
                WaitCondition::ElementWithName {
                    name_contains,
                    role,
                }
            }
            other => return Err(format!("Unknown wait condition: {other}")),
        };
        let met = self
            .backend
            .wait_until(&condition, timeout_ms)
            .await
            .map_err(|e| e.to_string())?;
        Ok(json!({"condition": condition_str, "met": met}))
    }

    /// Handle `browse_observe`: snapshot the page and build a fresh `UiMap`.
    ///
    /// Steps:
    /// 1. Fetch the screenshot, accessibility tree, URL, title and viewport
    ///    from the backend.
    /// 2. Run the perception pipeline over them to produce a `UiMap`.
    /// 3. Cache the `UiMap` so later click/type calls can resolve `el_N` IDs.
    /// 4. Write the screenshot to `<temp dir>/car-browser-screenshots/<uuid>.png`.
    ///
    /// The response carries `url`, `title`, the `ui_map` text summary,
    /// `screenshot_path`, `element_count` and the `viewport` size. The
    /// screenshot bytes are never returned inline, and the parameters are
    /// intentionally not read.
    ///
    /// # Errors
    ///
    /// Any backend or pipeline failure, and any failure to create the
    /// screenshot directory or write the file, is returned as a string. The
    /// `UiMap` cache is updated before the file is written, so a write failure
    /// still leaves the new map cached.
    async fn handle_observe(&self, _params: &Value) -> Result<Value, String> {
        let screenshot = self
            .backend
            .capture_screenshot()
            .await
            .map_err(|e| e.to_string())?;
        let a11y_nodes = self
            .backend
            .get_accessibility_tree()
            .await
            .map_err(|e| e.to_string())?;
        let url = self.backend.get_current_url().map_err(|e| e.to_string())?;
        let title = self
            .backend
            .get_page_title()
            .await
            .map_err(|e| e.to_string())?;
        let viewport = self.backend.get_viewport().map_err(|e| e.to_string())?;

        let ui_map = self
            .pipeline
            .perceive(&screenshot, &a11y_nodes, &url, viewport)
            .await
            .map_err(|e| e.to_string())?;

        // Pull out everything the response needs first, so the map can be
        // moved into the cache instead of cloned.
        let ui_map_text = ui_map.format_summary();
        let element_count = ui_map.elements.len();

        // Store UiMap for element ID resolution in subsequent click/type calls.
        *self.last_ui_map.write().await = Some(ui_map);

        // The screenshot always goes to a temp file rather than inline base64,
        // keeping the image bytes out of the tool response. Vision models can
        // read the file via `screenshot_path` if needed.
        // NOTE(review): these std::fs calls block the executor thread; consider
        // tokio::fs or spawn_blocking if the needed tokio features are enabled.
        let dir = std::env::temp_dir().join("car-browser-screenshots");
        std::fs::create_dir_all(&dir).map_err(|e| {
            format!(
                "Failed to create screenshot directory {}: {e}",
                dir.display()
            )
        })?;
        let path = dir.join(format!("{}.png", uuid::Uuid::new_v4()));
        std::fs::write(&path, &screenshot)
            .map_err(|e| format!("Failed to write screenshot {}: {e}", path.display()))?;
        let screenshot_path = path.to_string_lossy().to_string();

        Ok(json!({
            "url": url,
            "title": title,
            "ui_map": ui_map_text,
            "screenshot_path": screenshot_path,
            "element_count": element_count,
            "viewport": {
                "width": viewport.width,
                "height": viewport.height,
            }
        }))
    }
}

#[async_trait]
impl ToolExecutor for BrowserToolExecutor {
    /// Route a `browse_*` tool call to its handler.
    ///
    /// The tool name must be `browse_` followed by one of `navigate`, `click`,
    /// `type`, `scroll`, `keypress`, `wait` or `observe`. Any other name yields
    /// an `Unknown browser tool` error carrying the name as given.
    async fn execute(&self, tool: &str, params: &Value) -> Result<Value, String> {
        match tool.strip_prefix("browse_") {
            Some("navigate") => self.handle_navigate(params).await,
            Some("click") => self.handle_click(params).await,
            Some("type") => self.handle_type(params).await,
            Some("scroll") => self.handle_scroll(params).await,
            Some("keypress") => self.handle_keypress(params).await,
            Some("wait") => self.handle_wait(params).await,
            Some("observe") => self.handle_observe(params).await,
            // Covers both a missing prefix and an unrecognised suffix.
            _ => Err(format!("Unknown browser tool: {tool}")),
        }
    }
}