Skip to main content

car_browser/
tools.rs

1//! Browser tool registration and execution for car-engine.
2//!
3//! Registers `browse_*` tools with the CAR runtime and dispatches tool calls
4//! to the appropriate `BrowserBackend` methods.
5
6use std::sync::Arc;
7
8use async_trait::async_trait;
9use car_engine::ToolExecutor;
10use car_ir::ToolSchema;
11use serde_json::{json, Value};
12use tokio::sync::RwLock;
13
14use crate::backend::BrowserBackend;
15use crate::models::{Modifier, WaitCondition};
16use crate::perception::pipeline::PerceptionPipeline;
17use crate::perception::ui_map::UiMap;
18
19/// Tool executor that dispatches `browse_*` tool calls to a `BrowserBackend`.
20///
21/// Maintains a reference to the last UiMap from `browse_observe` to resolve
22/// `el_N` IDs to AX node IDs before passing to the backend.
23pub struct BrowserToolExecutor {
24    backend: Arc<dyn BrowserBackend>,
25    pipeline: Arc<dyn PerceptionPipeline>,
26    /// Last observed UiMap — used to resolve el_N → ax_ref for click/type/focus.
27    last_ui_map: Arc<RwLock<Option<UiMap>>>,
28}
29
30impl BrowserToolExecutor {
31    /// Create a new browser tool executor.
32    pub fn new(backend: Arc<dyn BrowserBackend>, pipeline: Arc<dyn PerceptionPipeline>) -> Self {
33        Self {
34            backend,
35            pipeline,
36            last_ui_map: Arc::new(RwLock::new(None)),
37        }
38    }
39
40    /// Resolve an element ID (el_N) to the backend's AX node ID.
41    ///
42    /// Falls back to the raw ID if no UiMap is available or element not found.
43    async fn resolve_element_id(&self, element_id: &str) -> String {
44        let guard = self.last_ui_map.read().await;
45        if let Some(ui_map) = guard.as_ref() {
46            if let Some(element) = ui_map.get_element(element_id) {
47                if let Some(ref ax_ref) = element.ax_ref {
48                    return ax_ref.clone();
49                }
50            }
51        }
52        // Fallback: pass through as-is (may be a raw AX node ID already)
53        element_id.to_string()
54    }
55
56    /// Get the tool schemas for all browser tools.
57    pub fn tool_schemas() -> Vec<ToolSchema> {
58        vec![
59            ToolSchema {
60                name: "browse_navigate".to_string(),
61                description: "Navigate the browser to a URL".to_string(),
62                parameters: json!({
63                    "type": "object",
64                    "properties": {
65                        "url": { "type": "string", "description": "URL to navigate to" }
66                    },
67                    "required": ["url"]
68                }),
69                returns: Some(json!({"type": "object", "properties": {"url": {"type": "string"}}})),
70                idempotent: false,
71                cache_ttl_secs: None,
72                rate_limit: None,
73            },
74            ToolSchema {
75                name: "browse_click".to_string(),
76                description: "Click on a UI element by accessibility node ID".to_string(),
77                parameters: json!({
78                    "type": "object",
79                    "properties": {
80                        "element_id": { "type": "string", "description": "Accessibility node ID (e.g. 'el_5')" }
81                    },
82                    "required": ["element_id"]
83                }),
84                returns: Some(json!({"type": "object"})),
85                idempotent: false,
86                cache_ttl_secs: None,
87                rate_limit: None,
88            },
89            ToolSchema {
90                name: "browse_type".to_string(),
91                description: "Type text into a UI element by accessibility node ID".to_string(),
92                parameters: json!({
93                    "type": "object",
94                    "properties": {
95                        "element_id": { "type": "string", "description": "Accessibility node ID of a text field" },
96                        "text": { "type": "string", "description": "Text to enter" }
97                    },
98                    "required": ["element_id", "text"]
99                }),
100                returns: Some(json!({"type": "object"})),
101                idempotent: false,
102                cache_ttl_secs: None,
103                rate_limit: None,
104            },
105            ToolSchema {
106                name: "browse_scroll".to_string(),
107                description: "Scroll the browser page".to_string(),
108                parameters: json!({
109                    "type": "object",
110                    "properties": {
111                        "delta_y": { "type": "integer", "description": "Scroll amount (positive = down, negative = up)" }
112                    },
113                    "required": ["delta_y"]
114                }),
115                returns: Some(json!({"type": "object"})),
116                idempotent: false,
117                cache_ttl_secs: None,
118                rate_limit: None,
119            },
120            ToolSchema {
121                name: "browse_keypress".to_string(),
122                description: "Press a key with optional modifiers".to_string(),
123                parameters: json!({
124                    "type": "object",
125                    "properties": {
126                        "key": { "type": "string", "description": "Key to press (e.g. 'Enter', 'a', 'Tab')" },
127                        "modifiers": {
128                            "type": "array",
129                            "items": { "type": "string", "enum": ["shift", "control", "alt", "meta"] },
130                            "description": "Optional modifier keys to hold during keypress"
131                        }
132                    },
133                    "required": ["key"]
134                }),
135                returns: Some(json!({"type": "object", "properties": {"key": {"type": "string"}, "status": {"type": "string"}}})),
136                idempotent: false,
137                cache_ttl_secs: None,
138                rate_limit: None,
139            },
140            ToolSchema {
141                name: "browse_wait".to_string(),
142                description: "Wait for a browser condition to be met".to_string(),
143                parameters: json!({
144                    "type": "object",
145                    "properties": {
146                        "condition": { "type": "string", "description": "Condition to wait for: 'page_loaded' or 'url_changed'" },
147                        "timeout_ms": { "type": "number", "description": "Timeout in milliseconds (default: 5000)" }
148                    },
149                    "required": ["condition"]
150                }),
151                returns: Some(json!({"type": "object", "properties": {"condition": {"type": "string"}, "met": {"type": "boolean"}}})),
152                idempotent: true,
153                cache_ttl_secs: None,
154                rate_limit: None,
155            },
156            ToolSchema {
157                name: "browse_observe".to_string(),
158                description: "Observe the current browser state: take screenshot, extract accessibility tree, produce UiMap".to_string(),
159                parameters: json!({
160                    "type": "object",
161                    "properties": {
162                        "include_screenshot": { "type": "boolean", "description": "Include base64 screenshot inline (default: false, returns file path instead)" }
163                    }
164                }),
165                returns: Some(json!({
166                    "type": "object",
167                    "properties": {
168                        "url": {"type": "string"},
169                        "title": {"type": "string"},
170                        "ui_map": {"type": "string"},
171                        "screenshot_path": {"type": "string"},
172                        "screenshot_base64": {"type": "string", "description": "Only present if include_screenshot=true"}
173                    }
174                })),
175                idempotent: true,
176                cache_ttl_secs: None,
177                rate_limit: None,
178            },
179        ]
180    }
181
182    async fn handle_navigate(&self, params: &Value) -> Result<Value, String> {
183        let url = params
184            .get("url")
185            .and_then(|v| v.as_str())
186            .ok_or("Missing required parameter: url")?;
187        self.backend
188            .navigate(url)
189            .await
190            .map_err(|e| e.to_string())?;
191        Ok(json!({"url": url, "status": "navigated"}))
192    }
193
194    async fn handle_click(&self, params: &Value) -> Result<Value, String> {
195        let element_id = params
196            .get("element_id")
197            .and_then(|v| v.as_str())
198            .ok_or("Missing required parameter: element_id")?;
199        let resolved_id = self.resolve_element_id(element_id).await;
200        self.backend
201            .click_element(&resolved_id)
202            .await
203            .map_err(|e| e.to_string())?;
204        Ok(json!({"element_id": element_id, "resolved_id": resolved_id, "status": "clicked"}))
205    }
206
207    async fn handle_type(&self, params: &Value) -> Result<Value, String> {
208        let element_id = params
209            .get("element_id")
210            .and_then(|v| v.as_str())
211            .ok_or("Missing required parameter: element_id")?;
212        let text = params
213            .get("text")
214            .and_then(|v| v.as_str())
215            .ok_or("Missing required parameter: text")?;
216        let resolved_id = self.resolve_element_id(element_id).await;
217        self.backend
218            .type_into_element(&resolved_id, text)
219            .await
220            .map_err(|e| e.to_string())?;
221        Ok(
222            json!({"element_id": element_id, "resolved_id": resolved_id, "text": text, "status": "typed"}),
223        )
224    }
225
226    async fn handle_scroll(&self, params: &Value) -> Result<Value, String> {
227        let delta_y = params
228            .get("delta_y")
229            .and_then(|v| v.as_i64())
230            .ok_or("Missing required parameter: delta_y")? as i32;
231        self.backend
232            .inject_scroll(delta_y)
233            .await
234            .map_err(|e| e.to_string())?;
235        Ok(json!({"delta_y": delta_y, "status": "scrolled"}))
236    }
237
238    async fn handle_keypress(&self, params: &Value) -> Result<Value, String> {
239        let key = params
240            .get("key")
241            .and_then(|v| v.as_str())
242            .ok_or("Missing required parameter: key")?;
243        let modifiers: Vec<Modifier> = params
244            .get("modifiers")
245            .and_then(|v| v.as_array())
246            .map(|arr| {
247                arr.iter()
248                    .filter_map(|m| match m.as_str()? {
249                        "shift" => Some(Modifier::Shift),
250                        "control" => Some(Modifier::Control),
251                        "alt" => Some(Modifier::Alt),
252                        "meta" => Some(Modifier::Meta),
253                        _ => None,
254                    })
255                    .collect()
256            })
257            .unwrap_or_default();
258        self.backend
259            .inject_keypress(key, &modifiers)
260            .await
261            .map_err(|e| e.to_string())?;
262        Ok(json!({"key": key, "status": "pressed"}))
263    }
264
265    async fn handle_wait(&self, params: &Value) -> Result<Value, String> {
266        let condition_str = params
267            .get("condition")
268            .and_then(|v| v.as_str())
269            .ok_or("Missing required parameter: condition")?;
270        let timeout_ms = params
271            .get("timeout_ms")
272            .and_then(|v| v.as_u64())
273            .unwrap_or(5000);
274        // Accepted shapes:
275        //   "page_loaded" | "url_changed"
276        //   "a11y_contains_text:<text>"
277        //   "element_with_name:<name>"
278        //   "element_with_name:<name>@<role>"
279        let condition = match condition_str {
280            "page_loaded" => WaitCondition::PageLoaded,
281            "url_changed" => WaitCondition::UrlChanged,
282            s if s.starts_with("a11y_contains_text:") => WaitCondition::A11yContainsText {
283                text: s["a11y_contains_text:".len()..].to_string(),
284            },
285            s if s.starts_with("element_with_name:") => {
286                let rest = &s["element_with_name:".len()..];
287                let (name_contains, role) = match rest.split_once('@') {
288                    Some((n, r)) => (n.to_string(), Some(r.to_string())),
289                    None => (rest.to_string(), None),
290                };
291                WaitCondition::ElementWithName {
292                    name_contains,
293                    role,
294                }
295            }
296            other => return Err(format!("Unknown wait condition: {other}")),
297        };
298        let met = self
299            .backend
300            .wait_until(&condition, timeout_ms)
301            .await
302            .map_err(|e| e.to_string())?;
303        Ok(json!({"condition": condition_str, "met": met}))
304    }
305
306    async fn handle_observe(&self, _params: &Value) -> Result<Value, String> {
307        let screenshot = self
308            .backend
309            .capture_screenshot()
310            .await
311            .map_err(|e| e.to_string())?;
312        let a11y_nodes = self
313            .backend
314            .get_accessibility_tree()
315            .await
316            .map_err(|e| e.to_string())?;
317        let url = self.backend.get_current_url().map_err(|e| e.to_string())?;
318        let title = self
319            .backend
320            .get_page_title()
321            .await
322            .map_err(|e| e.to_string())?;
323        let viewport = self.backend.get_viewport().map_err(|e| e.to_string())?;
324
325        let ui_map = self
326            .pipeline
327            .perceive(&screenshot, &a11y_nodes, &url, viewport)
328            .await
329            .map_err(|e| e.to_string())?;
330
331        // Store UiMap for element ID resolution in subsequent click/type calls
332        {
333            let mut guard = self.last_ui_map.write().await;
334            *guard = Some(ui_map.clone());
335        }
336
337        let ui_map_text = ui_map.format_summary();
338
339        // Save screenshot to temp file instead of inline base64 (saves ~3-5MB per observe)
340        let screenshot_path = {
341            let dir = std::env::temp_dir().join("car-browser-screenshots");
342            let _ = std::fs::create_dir_all(&dir);
343            let path = dir.join(format!("{}.png", uuid::Uuid::new_v4()));
344            std::fs::write(&path, &screenshot).map_err(|e| e.to_string())?;
345            path.to_string_lossy().to_string()
346        };
347
348        // Screenshot is always saved to disk — never inline base64 (saves ~80KB per observe).
349        // Vision models can read the file via screenshot_path if needed.
350        let result = json!({
351            "url": url,
352            "title": title,
353            "ui_map": ui_map_text,
354            "screenshot_path": screenshot_path,
355            "element_count": ui_map.elements.len(),
356            "viewport": {
357                "width": viewport.width,
358                "height": viewport.height,
359            }
360        });
361
362        Ok(result)
363    }
364}
365
366#[async_trait]
367impl ToolExecutor for BrowserToolExecutor {
368    async fn execute(&self, tool: &str, params: &Value) -> Result<Value, String> {
369        match tool {
370            "browse_navigate" => self.handle_navigate(params).await,
371            "browse_click" => self.handle_click(params).await,
372            "browse_type" => self.handle_type(params).await,
373            "browse_scroll" => self.handle_scroll(params).await,
374            "browse_keypress" => self.handle_keypress(params).await,
375            "browse_wait" => self.handle_wait(params).await,
376            "browse_observe" => self.handle_observe(params).await,
377            _ => Err(format!("Unknown browser tool: {tool}")),
378        }
379    }
380}