Skip to main content

car_browser/
tools.rs

1//! Browser tool registration and execution for car-engine.
2//!
3//! Registers `browse_*` tools with the CAR runtime and dispatches tool calls
4//! to the appropriate `BrowserBackend` methods.
5
6use std::sync::Arc;
7
8use async_trait::async_trait;
9use car_engine::ToolExecutor;
10use car_ir::ToolSchema;
11use serde_json::{json, Value};
12use tokio::sync::RwLock;
13
14use crate::backend::BrowserBackend;
15use crate::perception::pipeline::PerceptionPipeline;
16use crate::perception::ui_map::UiMap;
17
/// Tool executor that dispatches `browse_*` tool calls to a `BrowserBackend`.
///
/// Keeps the UiMap produced by the most recent `browse_observe` call so that
/// `el_N` IDs supplied to later calls can be translated to AX node IDs before
/// they are passed to the backend.
pub struct BrowserToolExecutor {
    /// Browser driver that performs navigation, input and page capture.
    backend: Arc<dyn BrowserBackend>,
    /// Perception pipeline that turns a captured page into a UiMap.
    pipeline: Arc<dyn PerceptionPipeline>,
    /// Last observed UiMap — used to resolve el_N → ax_ref for click/type.
    /// Holds `None` until an observation has been stored.
    last_ui_map: Arc<RwLock<Option<UiMap>>>,
}
28
29impl BrowserToolExecutor {
30    /// Create a new browser tool executor.
31    pub fn new(
32        backend: Arc<dyn BrowserBackend>,
33        pipeline: Arc<dyn PerceptionPipeline>,
34    ) -> Self {
35        Self {
36            backend,
37            pipeline,
38            last_ui_map: Arc::new(RwLock::new(None)),
39        }
40    }
41
42    /// Resolve an element ID (el_N) to the backend's AX node ID.
43    ///
44    /// Falls back to the raw ID if no UiMap is available or element not found.
45    async fn resolve_element_id(&self, element_id: &str) -> String {
46        let guard = self.last_ui_map.read().await;
47        if let Some(ui_map) = guard.as_ref() {
48            if let Some(element) = ui_map.get_element(element_id) {
49                if let Some(ref ax_ref) = element.ax_ref {
50                    return ax_ref.clone();
51                }
52            }
53        }
54        // Fallback: pass through as-is (may be a raw AX node ID already)
55        element_id.to_string()
56    }
57
58    /// Get the tool schemas for all browser tools.
59    pub fn tool_schemas() -> Vec<ToolSchema> {
60        vec![
61            ToolSchema {
62                name: "browse_navigate".to_string(),
63                description: "Navigate the browser to a URL".to_string(),
64                parameters: json!({
65                    "type": "object",
66                    "properties": {
67                        "url": { "type": "string", "description": "URL to navigate to" }
68                    },
69                    "required": ["url"]
70                }),
71                returns: Some(json!({"type": "object", "properties": {"url": {"type": "string"}}})),
72                idempotent: false,
73                cache_ttl_secs: None,
74                rate_limit: None,
75            },
76            ToolSchema {
77                name: "browse_click".to_string(),
78                description: "Click on a UI element by accessibility node ID".to_string(),
79                parameters: json!({
80                    "type": "object",
81                    "properties": {
82                        "element_id": { "type": "string", "description": "Accessibility node ID (e.g. 'el_5')" }
83                    },
84                    "required": ["element_id"]
85                }),
86                returns: Some(json!({"type": "object"})),
87                idempotent: false,
88                cache_ttl_secs: None,
89                rate_limit: None,
90            },
91            ToolSchema {
92                name: "browse_type".to_string(),
93                description: "Type text into a UI element by accessibility node ID".to_string(),
94                parameters: json!({
95                    "type": "object",
96                    "properties": {
97                        "element_id": { "type": "string", "description": "Accessibility node ID of a text field" },
98                        "text": { "type": "string", "description": "Text to enter" }
99                    },
100                    "required": ["element_id", "text"]
101                }),
102                returns: Some(json!({"type": "object"})),
103                idempotent: false,
104                cache_ttl_secs: None,
105                rate_limit: None,
106            },
107            ToolSchema {
108                name: "browse_scroll".to_string(),
109                description: "Scroll the browser page".to_string(),
110                parameters: json!({
111                    "type": "object",
112                    "properties": {
113                        "delta_y": { "type": "integer", "description": "Scroll amount (positive = down, negative = up)" }
114                    },
115                    "required": ["delta_y"]
116                }),
117                returns: Some(json!({"type": "object"})),
118                idempotent: false,
119                cache_ttl_secs: None,
120                rate_limit: None,
121            },
122            ToolSchema {
123                name: "browse_observe".to_string(),
124                description: "Observe the current browser state: take screenshot, extract accessibility tree, produce UiMap".to_string(),
125                parameters: json!({
126                    "type": "object",
127                    "properties": {}
128                }),
129                returns: Some(json!({
130                    "type": "object",
131                    "properties": {
132                        "url": {"type": "string"},
133                        "title": {"type": "string"},
134                        "ui_map": {"type": "string"},
135                        "screenshot_base64": {"type": "string"}
136                    }
137                })),
138                idempotent: true,
139                cache_ttl_secs: None,
140                rate_limit: None,
141            },
142        ]
143    }
144
145    async fn handle_navigate(&self, params: &Value) -> Result<Value, String> {
146        let url = params
147            .get("url")
148            .and_then(|v| v.as_str())
149            .ok_or("Missing required parameter: url")?;
150        self.backend
151            .navigate(url)
152            .await
153            .map_err(|e| e.to_string())?;
154        Ok(json!({"url": url, "status": "navigated"}))
155    }
156
157    async fn handle_click(&self, params: &Value) -> Result<Value, String> {
158        let element_id = params
159            .get("element_id")
160            .and_then(|v| v.as_str())
161            .ok_or("Missing required parameter: element_id")?;
162        let resolved_id = self.resolve_element_id(element_id).await;
163        self.backend
164            .click_element(&resolved_id)
165            .await
166            .map_err(|e| e.to_string())?;
167        Ok(json!({"element_id": element_id, "resolved_id": resolved_id, "status": "clicked"}))
168    }
169
170    async fn handle_type(&self, params: &Value) -> Result<Value, String> {
171        let element_id = params
172            .get("element_id")
173            .and_then(|v| v.as_str())
174            .ok_or("Missing required parameter: element_id")?;
175        let text = params
176            .get("text")
177            .and_then(|v| v.as_str())
178            .ok_or("Missing required parameter: text")?;
179        let resolved_id = self.resolve_element_id(element_id).await;
180        self.backend
181            .type_into_element(&resolved_id, text)
182            .await
183            .map_err(|e| e.to_string())?;
184        Ok(json!({"element_id": element_id, "resolved_id": resolved_id, "text": text, "status": "typed"}))
185    }
186
187    async fn handle_scroll(&self, params: &Value) -> Result<Value, String> {
188        let delta_y = params
189            .get("delta_y")
190            .and_then(|v| v.as_i64())
191            .ok_or("Missing required parameter: delta_y")? as i32;
192        self.backend
193            .inject_scroll(delta_y)
194            .await
195            .map_err(|e| e.to_string())?;
196        Ok(json!({"delta_y": delta_y, "status": "scrolled"}))
197    }
198
199    async fn handle_observe(&self, _params: &Value) -> Result<Value, String> {
200        let screenshot = self
201            .backend
202            .capture_screenshot()
203            .await
204            .map_err(|e| e.to_string())?;
205        let a11y_nodes = self
206            .backend
207            .get_accessibility_tree()
208            .await
209            .map_err(|e| e.to_string())?;
210        let url = self.backend.get_current_url().map_err(|e| e.to_string())?;
211        let title = self
212            .backend
213            .get_page_title()
214            .await
215            .map_err(|e| e.to_string())?;
216        let viewport = self.backend.get_viewport().map_err(|e| e.to_string())?;
217
218        let ui_map = self
219            .pipeline
220            .perceive(&screenshot, &a11y_nodes, &url, viewport)
221            .await
222            .map_err(|e| e.to_string())?;
223
224        // Store UiMap for element ID resolution in subsequent click/type calls
225        {
226            let mut guard = self.last_ui_map.write().await;
227            *guard = Some(ui_map.clone());
228        }
229
230        let screenshot_b64 = base64::Engine::encode(
231            &base64::engine::general_purpose::STANDARD,
232            &screenshot,
233        );
234
235        let ui_map_text = ui_map.format_compact();
236
237        Ok(json!({
238            "url": url,
239            "title": title,
240            "ui_map": ui_map_text,
241            "screenshot_base64": screenshot_b64,
242            "element_count": ui_map.elements.len(),
243            "viewport": {
244                "width": viewport.width,
245                "height": viewport.height,
246            }
247        }))
248    }
249}
250
251#[async_trait]
252impl ToolExecutor for BrowserToolExecutor {
253    async fn execute(&self, tool: &str, params: &Value) -> Result<Value, String> {
254        match tool {
255            "browse_navigate" => self.handle_navigate(params).await,
256            "browse_click" => self.handle_click(params).await,
257            "browse_type" => self.handle_type(params).await,
258            "browse_scroll" => self.handle_scroll(params).await,
259            "browse_observe" => self.handle_observe(params).await,
260            _ => Err(format!("Unknown browser tool: {tool}")),
261        }
262    }
263}