1use std::sync::Arc;
7
8use async_trait::async_trait;
9use car_engine::ToolExecutor;
10use car_ir::ToolSchema;
11use serde_json::{json, Value};
12use tokio::sync::RwLock;
13
14use crate::backend::BrowserBackend;
15use crate::models::{Modifier, WaitCondition};
16use crate::perception::pipeline::PerceptionPipeline;
17use crate::perception::ui_map::UiMap;
18
19pub struct BrowserToolExecutor {
24 backend: Arc<dyn BrowserBackend>,
25 pipeline: Arc<dyn PerceptionPipeline>,
26 last_ui_map: Arc<RwLock<Option<UiMap>>>,
28}
29
30impl BrowserToolExecutor {
31 pub fn new(backend: Arc<dyn BrowserBackend>, pipeline: Arc<dyn PerceptionPipeline>) -> Self {
33 Self {
34 backend,
35 pipeline,
36 last_ui_map: Arc::new(RwLock::new(None)),
37 }
38 }
39
40 async fn resolve_element_id(&self, element_id: &str) -> String {
44 let guard = self.last_ui_map.read().await;
45 if let Some(ui_map) = guard.as_ref() {
46 if let Some(element) = ui_map.get_element(element_id) {
47 if let Some(ref ax_ref) = element.ax_ref {
48 return ax_ref.clone();
49 }
50 }
51 }
52 element_id.to_string()
54 }
55
56 pub fn tool_schemas() -> Vec<ToolSchema> {
58 vec![
59 ToolSchema {
60 name: "browse_navigate".to_string(),
61 description: "Navigate the browser to a URL".to_string(),
62 parameters: json!({
63 "type": "object",
64 "properties": {
65 "url": { "type": "string", "description": "URL to navigate to" }
66 },
67 "required": ["url"]
68 }),
69 returns: Some(json!({"type": "object", "properties": {"url": {"type": "string"}}})),
70 idempotent: false,
71 cache_ttl_secs: None,
72 rate_limit: None,
73 },
74 ToolSchema {
75 name: "browse_click".to_string(),
76 description: "Click on a UI element by accessibility node ID".to_string(),
77 parameters: json!({
78 "type": "object",
79 "properties": {
80 "element_id": { "type": "string", "description": "Accessibility node ID (e.g. 'el_5')" }
81 },
82 "required": ["element_id"]
83 }),
84 returns: Some(json!({"type": "object"})),
85 idempotent: false,
86 cache_ttl_secs: None,
87 rate_limit: None,
88 },
89 ToolSchema {
90 name: "browse_type".to_string(),
91 description: "Type text into a UI element by accessibility node ID".to_string(),
92 parameters: json!({
93 "type": "object",
94 "properties": {
95 "element_id": { "type": "string", "description": "Accessibility node ID of a text field" },
96 "text": { "type": "string", "description": "Text to enter" }
97 },
98 "required": ["element_id", "text"]
99 }),
100 returns: Some(json!({"type": "object"})),
101 idempotent: false,
102 cache_ttl_secs: None,
103 rate_limit: None,
104 },
105 ToolSchema {
106 name: "browse_scroll".to_string(),
107 description: "Scroll the browser page".to_string(),
108 parameters: json!({
109 "type": "object",
110 "properties": {
111 "delta_y": { "type": "integer", "description": "Scroll amount (positive = down, negative = up)" }
112 },
113 "required": ["delta_y"]
114 }),
115 returns: Some(json!({"type": "object"})),
116 idempotent: false,
117 cache_ttl_secs: None,
118 rate_limit: None,
119 },
120 ToolSchema {
121 name: "browse_keypress".to_string(),
122 description: "Press a key with optional modifiers".to_string(),
123 parameters: json!({
124 "type": "object",
125 "properties": {
126 "key": { "type": "string", "description": "Key to press (e.g. 'Enter', 'a', 'Tab')" },
127 "modifiers": {
128 "type": "array",
129 "items": { "type": "string", "enum": ["shift", "control", "alt", "meta"] },
130 "description": "Optional modifier keys to hold during keypress"
131 }
132 },
133 "required": ["key"]
134 }),
135 returns: Some(json!({"type": "object", "properties": {"key": {"type": "string"}, "status": {"type": "string"}}})),
136 idempotent: false,
137 cache_ttl_secs: None,
138 rate_limit: None,
139 },
140 ToolSchema {
141 name: "browse_wait".to_string(),
142 description: "Wait for a browser condition to be met".to_string(),
143 parameters: json!({
144 "type": "object",
145 "properties": {
146 "condition": { "type": "string", "description": "Condition to wait for: 'page_loaded' or 'url_changed'" },
147 "timeout_ms": { "type": "number", "description": "Timeout in milliseconds (default: 5000)" }
148 },
149 "required": ["condition"]
150 }),
151 returns: Some(json!({"type": "object", "properties": {"condition": {"type": "string"}, "met": {"type": "boolean"}}})),
152 idempotent: true,
153 cache_ttl_secs: None,
154 rate_limit: None,
155 },
156 ToolSchema {
157 name: "browse_observe".to_string(),
158 description: "Observe the current browser state: take screenshot, extract accessibility tree, produce UiMap".to_string(),
159 parameters: json!({
160 "type": "object",
161 "properties": {
162 "include_screenshot": { "type": "boolean", "description": "Include base64 screenshot inline (default: false, returns file path instead)" }
163 }
164 }),
165 returns: Some(json!({
166 "type": "object",
167 "properties": {
168 "url": {"type": "string"},
169 "title": {"type": "string"},
170 "ui_map": {"type": "string"},
171 "screenshot_path": {"type": "string"},
172 "screenshot_base64": {"type": "string", "description": "Only present if include_screenshot=true"}
173 }
174 })),
175 idempotent: true,
176 cache_ttl_secs: None,
177 rate_limit: None,
178 },
179 ]
180 }
181
182 async fn handle_navigate(&self, params: &Value) -> Result<Value, String> {
183 let url = params
184 .get("url")
185 .and_then(|v| v.as_str())
186 .ok_or("Missing required parameter: url")?;
187 self.backend
188 .navigate(url)
189 .await
190 .map_err(|e| e.to_string())?;
191 Ok(json!({"url": url, "status": "navigated"}))
192 }
193
194 async fn handle_click(&self, params: &Value) -> Result<Value, String> {
195 let element_id = params
196 .get("element_id")
197 .and_then(|v| v.as_str())
198 .ok_or("Missing required parameter: element_id")?;
199 let resolved_id = self.resolve_element_id(element_id).await;
200 self.backend
201 .click_element(&resolved_id)
202 .await
203 .map_err(|e| e.to_string())?;
204 Ok(json!({"element_id": element_id, "resolved_id": resolved_id, "status": "clicked"}))
205 }
206
207 async fn handle_type(&self, params: &Value) -> Result<Value, String> {
208 let element_id = params
209 .get("element_id")
210 .and_then(|v| v.as_str())
211 .ok_or("Missing required parameter: element_id")?;
212 let text = params
213 .get("text")
214 .and_then(|v| v.as_str())
215 .ok_or("Missing required parameter: text")?;
216 let resolved_id = self.resolve_element_id(element_id).await;
217 self.backend
218 .type_into_element(&resolved_id, text)
219 .await
220 .map_err(|e| e.to_string())?;
221 Ok(
222 json!({"element_id": element_id, "resolved_id": resolved_id, "text": text, "status": "typed"}),
223 )
224 }
225
226 async fn handle_scroll(&self, params: &Value) -> Result<Value, String> {
227 let delta_y = params
228 .get("delta_y")
229 .and_then(|v| v.as_i64())
230 .ok_or("Missing required parameter: delta_y")? as i32;
231 self.backend
232 .inject_scroll(delta_y)
233 .await
234 .map_err(|e| e.to_string())?;
235 Ok(json!({"delta_y": delta_y, "status": "scrolled"}))
236 }
237
238 async fn handle_keypress(&self, params: &Value) -> Result<Value, String> {
239 let key = params
240 .get("key")
241 .and_then(|v| v.as_str())
242 .ok_or("Missing required parameter: key")?;
243 let modifiers: Vec<Modifier> = params
244 .get("modifiers")
245 .and_then(|v| v.as_array())
246 .map(|arr| {
247 arr.iter()
248 .filter_map(|m| match m.as_str()? {
249 "shift" => Some(Modifier::Shift),
250 "control" => Some(Modifier::Control),
251 "alt" => Some(Modifier::Alt),
252 "meta" => Some(Modifier::Meta),
253 _ => None,
254 })
255 .collect()
256 })
257 .unwrap_or_default();
258 self.backend
259 .inject_keypress(key, &modifiers)
260 .await
261 .map_err(|e| e.to_string())?;
262 Ok(json!({"key": key, "status": "pressed"}))
263 }
264
265 async fn handle_wait(&self, params: &Value) -> Result<Value, String> {
266 let condition_str = params
267 .get("condition")
268 .and_then(|v| v.as_str())
269 .ok_or("Missing required parameter: condition")?;
270 let timeout_ms = params
271 .get("timeout_ms")
272 .and_then(|v| v.as_u64())
273 .unwrap_or(5000);
274 let condition = match condition_str {
280 "page_loaded" => WaitCondition::PageLoaded,
281 "url_changed" => WaitCondition::UrlChanged,
282 s if s.starts_with("a11y_contains_text:") => WaitCondition::A11yContainsText {
283 text: s["a11y_contains_text:".len()..].to_string(),
284 },
285 s if s.starts_with("element_with_name:") => {
286 let rest = &s["element_with_name:".len()..];
287 let (name_contains, role) = match rest.split_once('@') {
288 Some((n, r)) => (n.to_string(), Some(r.to_string())),
289 None => (rest.to_string(), None),
290 };
291 WaitCondition::ElementWithName {
292 name_contains,
293 role,
294 }
295 }
296 other => return Err(format!("Unknown wait condition: {other}")),
297 };
298 let met = self
299 .backend
300 .wait_until(&condition, timeout_ms)
301 .await
302 .map_err(|e| e.to_string())?;
303 Ok(json!({"condition": condition_str, "met": met}))
304 }
305
306 async fn handle_observe(&self, _params: &Value) -> Result<Value, String> {
307 let screenshot = self
308 .backend
309 .capture_screenshot()
310 .await
311 .map_err(|e| e.to_string())?;
312 let a11y_nodes = self
313 .backend
314 .get_accessibility_tree()
315 .await
316 .map_err(|e| e.to_string())?;
317 let url = self.backend.get_current_url().map_err(|e| e.to_string())?;
318 let title = self
319 .backend
320 .get_page_title()
321 .await
322 .map_err(|e| e.to_string())?;
323 let viewport = self.backend.get_viewport().map_err(|e| e.to_string())?;
324
325 let ui_map = self
326 .pipeline
327 .perceive(&screenshot, &a11y_nodes, &url, viewport)
328 .await
329 .map_err(|e| e.to_string())?;
330
331 {
333 let mut guard = self.last_ui_map.write().await;
334 *guard = Some(ui_map.clone());
335 }
336
337 let ui_map_text = ui_map.format_summary();
338
339 let screenshot_path = {
341 let dir = std::env::temp_dir().join("car-browser-screenshots");
342 let _ = std::fs::create_dir_all(&dir);
343 let path = dir.join(format!("{}.png", uuid::Uuid::new_v4()));
344 std::fs::write(&path, &screenshot).map_err(|e| e.to_string())?;
345 path.to_string_lossy().to_string()
346 };
347
348 let result = json!({
351 "url": url,
352 "title": title,
353 "ui_map": ui_map_text,
354 "screenshot_path": screenshot_path,
355 "element_count": ui_map.elements.len(),
356 "viewport": {
357 "width": viewport.width,
358 "height": viewport.height,
359 }
360 });
361
362 Ok(result)
363 }
364}
365
366#[async_trait]
367impl ToolExecutor for BrowserToolExecutor {
368 async fn execute(&self, tool: &str, params: &Value) -> Result<Value, String> {
369 match tool {
370 "browse_navigate" => self.handle_navigate(params).await,
371 "browse_click" => self.handle_click(params).await,
372 "browse_type" => self.handle_type(params).await,
373 "browse_scroll" => self.handle_scroll(params).await,
374 "browse_keypress" => self.handle_keypress(params).await,
375 "browse_wait" => self.handle_wait(params).await,
376 "browse_observe" => self.handle_observe(params).await,
377 _ => Err(format!("Unknown browser tool: {tool}")),
378 }
379 }
380}