Skip to main content

car_browser/
tools.rs

1//! Browser tool registration and execution for car-engine.
2//!
3//! Registers `browse_*` tools with the CAR runtime and dispatches tool calls
4//! to the appropriate `BrowserBackend` methods.
5
6use std::sync::Arc;
7
8use async_trait::async_trait;
9use car_engine::ToolExecutor;
10use car_ir::ToolSchema;
11use serde_json::{json, Value};
12use tokio::sync::RwLock;
13
14use crate::backend::BrowserBackend;
15use crate::models::{Modifier, WaitCondition};
16use crate::perception::pipeline::PerceptionPipeline;
17use crate::perception::ui_map::UiMap;
18
/// Tool executor that dispatches `browse_*` tool calls to a `BrowserBackend`.
///
/// Maintains a reference to the last UiMap from `browse_observe` to resolve
/// `el_N` IDs to AX node IDs before passing to the backend.
///
/// When no UiMap has been stored yet, or an ID cannot be resolved through it,
/// the ID is forwarded to the backend unchanged.
pub struct BrowserToolExecutor {
    /// Backend that carries out the browser actions (navigation, input
    /// injection, screenshots, accessibility-tree queries).
    backend: Arc<dyn BrowserBackend>,
    /// Perception pipeline that turns a screenshot plus accessibility nodes
    /// into a `UiMap` during `browse_observe`.
    pipeline: Arc<dyn PerceptionPipeline>,
    /// Last observed UiMap — used to resolve el_N → ax_ref for click/type.
    /// `None` until the first successful `browse_observe`.
    last_ui_map: Arc<RwLock<Option<UiMap>>>,
}
29
30impl BrowserToolExecutor {
31    /// Create a new browser tool executor.
32    pub fn new(
33        backend: Arc<dyn BrowserBackend>,
34        pipeline: Arc<dyn PerceptionPipeline>,
35    ) -> Self {
36        Self {
37            backend,
38            pipeline,
39            last_ui_map: Arc::new(RwLock::new(None)),
40        }
41    }
42
43    /// Resolve an element ID (el_N) to the backend's AX node ID.
44    ///
45    /// Falls back to the raw ID if no UiMap is available or element not found.
46    async fn resolve_element_id(&self, element_id: &str) -> String {
47        let guard = self.last_ui_map.read().await;
48        if let Some(ui_map) = guard.as_ref() {
49            if let Some(element) = ui_map.get_element(element_id) {
50                if let Some(ref ax_ref) = element.ax_ref {
51                    return ax_ref.clone();
52                }
53            }
54        }
55        // Fallback: pass through as-is (may be a raw AX node ID already)
56        element_id.to_string()
57    }
58
59    /// Get the tool schemas for all browser tools.
60    pub fn tool_schemas() -> Vec<ToolSchema> {
61        vec![
62            ToolSchema {
63                name: "browse_navigate".to_string(),
64                description: "Navigate the browser to a URL".to_string(),
65                parameters: json!({
66                    "type": "object",
67                    "properties": {
68                        "url": { "type": "string", "description": "URL to navigate to" }
69                    },
70                    "required": ["url"]
71                }),
72                returns: Some(json!({"type": "object", "properties": {"url": {"type": "string"}}})),
73                idempotent: false,
74                cache_ttl_secs: None,
75                rate_limit: None,
76            },
77            ToolSchema {
78                name: "browse_click".to_string(),
79                description: "Click on a UI element by accessibility node ID".to_string(),
80                parameters: json!({
81                    "type": "object",
82                    "properties": {
83                        "element_id": { "type": "string", "description": "Accessibility node ID (e.g. 'el_5')" }
84                    },
85                    "required": ["element_id"]
86                }),
87                returns: Some(json!({"type": "object"})),
88                idempotent: false,
89                cache_ttl_secs: None,
90                rate_limit: None,
91            },
92            ToolSchema {
93                name: "browse_type".to_string(),
94                description: "Type text into a UI element by accessibility node ID".to_string(),
95                parameters: json!({
96                    "type": "object",
97                    "properties": {
98                        "element_id": { "type": "string", "description": "Accessibility node ID of a text field" },
99                        "text": { "type": "string", "description": "Text to enter" }
100                    },
101                    "required": ["element_id", "text"]
102                }),
103                returns: Some(json!({"type": "object"})),
104                idempotent: false,
105                cache_ttl_secs: None,
106                rate_limit: None,
107            },
108            ToolSchema {
109                name: "browse_scroll".to_string(),
110                description: "Scroll the browser page".to_string(),
111                parameters: json!({
112                    "type": "object",
113                    "properties": {
114                        "delta_y": { "type": "integer", "description": "Scroll amount (positive = down, negative = up)" }
115                    },
116                    "required": ["delta_y"]
117                }),
118                returns: Some(json!({"type": "object"})),
119                idempotent: false,
120                cache_ttl_secs: None,
121                rate_limit: None,
122            },
123            ToolSchema {
124                name: "browse_keypress".to_string(),
125                description: "Press a key with optional modifiers".to_string(),
126                parameters: json!({
127                    "type": "object",
128                    "properties": {
129                        "key": { "type": "string", "description": "Key to press (e.g. 'Enter', 'a', 'Tab')" },
130                        "modifiers": {
131                            "type": "array",
132                            "items": { "type": "string", "enum": ["shift", "control", "alt", "meta"] },
133                            "description": "Optional modifier keys to hold during keypress"
134                        }
135                    },
136                    "required": ["key"]
137                }),
138                returns: Some(json!({"type": "object", "properties": {"key": {"type": "string"}, "status": {"type": "string"}}})),
139                idempotent: false,
140                cache_ttl_secs: None,
141                rate_limit: None,
142            },
143            ToolSchema {
144                name: "browse_wait".to_string(),
145                description: "Wait for a browser condition to be met".to_string(),
146                parameters: json!({
147                    "type": "object",
148                    "properties": {
149                        "condition": { "type": "string", "description": "Condition to wait for: 'page_loaded' or 'url_changed'" },
150                        "timeout_ms": { "type": "number", "description": "Timeout in milliseconds (default: 5000)" }
151                    },
152                    "required": ["condition"]
153                }),
154                returns: Some(json!({"type": "object", "properties": {"condition": {"type": "string"}, "met": {"type": "boolean"}}})),
155                idempotent: true,
156                cache_ttl_secs: None,
157                rate_limit: None,
158            },
159            ToolSchema {
160                name: "browse_observe".to_string(),
161                description: "Observe the current browser state: take screenshot, extract accessibility tree, produce UiMap".to_string(),
162                parameters: json!({
163                    "type": "object",
164                    "properties": {
165                        "include_screenshot": { "type": "boolean", "description": "Include base64 screenshot inline (default: false, returns file path instead)" }
166                    }
167                }),
168                returns: Some(json!({
169                    "type": "object",
170                    "properties": {
171                        "url": {"type": "string"},
172                        "title": {"type": "string"},
173                        "ui_map": {"type": "string"},
174                        "screenshot_path": {"type": "string"},
175                        "screenshot_base64": {"type": "string", "description": "Only present if include_screenshot=true"}
176                    }
177                })),
178                idempotent: true,
179                cache_ttl_secs: None,
180                rate_limit: None,
181            },
182        ]
183    }
184
185    async fn handle_navigate(&self, params: &Value) -> Result<Value, String> {
186        let url = params
187            .get("url")
188            .and_then(|v| v.as_str())
189            .ok_or("Missing required parameter: url")?;
190        self.backend
191            .navigate(url)
192            .await
193            .map_err(|e| e.to_string())?;
194        Ok(json!({"url": url, "status": "navigated"}))
195    }
196
197    async fn handle_click(&self, params: &Value) -> Result<Value, String> {
198        let element_id = params
199            .get("element_id")
200            .and_then(|v| v.as_str())
201            .ok_or("Missing required parameter: element_id")?;
202        let resolved_id = self.resolve_element_id(element_id).await;
203        self.backend
204            .click_element(&resolved_id)
205            .await
206            .map_err(|e| e.to_string())?;
207        Ok(json!({"element_id": element_id, "resolved_id": resolved_id, "status": "clicked"}))
208    }
209
210    async fn handle_type(&self, params: &Value) -> Result<Value, String> {
211        let element_id = params
212            .get("element_id")
213            .and_then(|v| v.as_str())
214            .ok_or("Missing required parameter: element_id")?;
215        let text = params
216            .get("text")
217            .and_then(|v| v.as_str())
218            .ok_or("Missing required parameter: text")?;
219        let resolved_id = self.resolve_element_id(element_id).await;
220        self.backend
221            .type_into_element(&resolved_id, text)
222            .await
223            .map_err(|e| e.to_string())?;
224        Ok(json!({"element_id": element_id, "resolved_id": resolved_id, "text": text, "status": "typed"}))
225    }
226
227    async fn handle_scroll(&self, params: &Value) -> Result<Value, String> {
228        let delta_y = params
229            .get("delta_y")
230            .and_then(|v| v.as_i64())
231            .ok_or("Missing required parameter: delta_y")? as i32;
232        self.backend
233            .inject_scroll(delta_y)
234            .await
235            .map_err(|e| e.to_string())?;
236        Ok(json!({"delta_y": delta_y, "status": "scrolled"}))
237    }
238
239    async fn handle_keypress(&self, params: &Value) -> Result<Value, String> {
240        let key = params
241            .get("key")
242            .and_then(|v| v.as_str())
243            .ok_or("Missing required parameter: key")?;
244        let modifiers: Vec<Modifier> = params
245            .get("modifiers")
246            .and_then(|v| v.as_array())
247            .map(|arr| {
248                arr.iter()
249                    .filter_map(|m| match m.as_str()? {
250                        "shift" => Some(Modifier::Shift),
251                        "control" => Some(Modifier::Control),
252                        "alt" => Some(Modifier::Alt),
253                        "meta" => Some(Modifier::Meta),
254                        _ => None,
255                    })
256                    .collect()
257            })
258            .unwrap_or_default();
259        self.backend
260            .inject_keypress(key, &modifiers)
261            .await
262            .map_err(|e| e.to_string())?;
263        Ok(json!({"key": key, "status": "pressed"}))
264    }
265
266    async fn handle_wait(&self, params: &Value) -> Result<Value, String> {
267        let condition_str = params
268            .get("condition")
269            .and_then(|v| v.as_str())
270            .ok_or("Missing required parameter: condition")?;
271        let timeout_ms = params
272            .get("timeout_ms")
273            .and_then(|v| v.as_u64())
274            .unwrap_or(5000);
275        // Accepted shapes:
276        //   "page_loaded" | "url_changed"
277        //   "a11y_contains_text:<text>"
278        //   "element_with_name:<name>"
279        //   "element_with_name:<name>@<role>"
280        let condition = match condition_str {
281            "page_loaded" => WaitCondition::PageLoaded,
282            "url_changed" => WaitCondition::UrlChanged,
283            s if s.starts_with("a11y_contains_text:") => WaitCondition::A11yContainsText {
284                text: s["a11y_contains_text:".len()..].to_string(),
285            },
286            s if s.starts_with("element_with_name:") => {
287                let rest = &s["element_with_name:".len()..];
288                let (name_contains, role) = match rest.split_once('@') {
289                    Some((n, r)) => (n.to_string(), Some(r.to_string())),
290                    None => (rest.to_string(), None),
291                };
292                WaitCondition::ElementWithName {
293                    name_contains,
294                    role,
295                }
296            }
297            other => return Err(format!("Unknown wait condition: {other}")),
298        };
299        let met = self
300            .backend
301            .wait_until(&condition, timeout_ms)
302            .await
303            .map_err(|e| e.to_string())?;
304        Ok(json!({"condition": condition_str, "met": met}))
305    }
306
307    async fn handle_observe(&self, _params: &Value) -> Result<Value, String> {
308        let screenshot = self
309            .backend
310            .capture_screenshot()
311            .await
312            .map_err(|e| e.to_string())?;
313        let a11y_nodes = self
314            .backend
315            .get_accessibility_tree()
316            .await
317            .map_err(|e| e.to_string())?;
318        let url = self.backend.get_current_url().map_err(|e| e.to_string())?;
319        let title = self
320            .backend
321            .get_page_title()
322            .await
323            .map_err(|e| e.to_string())?;
324        let viewport = self.backend.get_viewport().map_err(|e| e.to_string())?;
325
326        let ui_map = self
327            .pipeline
328            .perceive(&screenshot, &a11y_nodes, &url, viewport)
329            .await
330            .map_err(|e| e.to_string())?;
331
332        // Store UiMap for element ID resolution in subsequent click/type calls
333        {
334            let mut guard = self.last_ui_map.write().await;
335            *guard = Some(ui_map.clone());
336        }
337
338        let ui_map_text = ui_map.format_summary();
339
340        // Save screenshot to temp file instead of inline base64 (saves ~3-5MB per observe)
341        let screenshot_path = {
342            let dir = std::env::temp_dir().join("car-browser-screenshots");
343            let _ = std::fs::create_dir_all(&dir);
344            let path = dir.join(format!("{}.png", uuid::Uuid::new_v4()));
345            std::fs::write(&path, &screenshot).map_err(|e| e.to_string())?;
346            path.to_string_lossy().to_string()
347        };
348
349        // Screenshot is always saved to disk — never inline base64 (saves ~80KB per observe).
350        // Vision models can read the file via screenshot_path if needed.
351        let result = json!({
352            "url": url,
353            "title": title,
354            "ui_map": ui_map_text,
355            "screenshot_path": screenshot_path,
356            "element_count": ui_map.elements.len(),
357            "viewport": {
358                "width": viewport.width,
359                "height": viewport.height,
360            }
361        });
362
363        Ok(result)
364    }
365}
366
367#[async_trait]
368impl ToolExecutor for BrowserToolExecutor {
369    async fn execute(&self, tool: &str, params: &Value) -> Result<Value, String> {
370        match tool {
371            "browse_navigate" => self.handle_navigate(params).await,
372            "browse_click" => self.handle_click(params).await,
373            "browse_type" => self.handle_type(params).await,
374            "browse_scroll" => self.handle_scroll(params).await,
375            "browse_keypress" => self.handle_keypress(params).await,
376            "browse_wait" => self.handle_wait(params).await,
377            "browse_observe" => self.handle_observe(params).await,
378            _ => Err(format!("Unknown browser tool: {tool}")),
379        }
380    }
381}