use std::sync::Arc;
use async_trait::async_trait;
use car_engine::ToolExecutor;
use car_ir::ToolSchema;
use serde_json::{json, Value};
use tokio::sync::RwLock;
use crate::backend::BrowserBackend;
use crate::models::{Modifier, WaitCondition};
use crate::perception::pipeline::PerceptionPipeline;
use crate::perception::ui_map::UiMap;
/// Tool executor that exposes browser automation as the `browse_*` tools.
///
/// Bundles a browser backend, a perception pipeline, and the most recent
/// `UiMap` produced by an observation so later actions can resolve element ids.
pub struct BrowserToolExecutor {
/// Backend used for navigation, input injection, waiting and page capture.
backend: Arc<dyn BrowserBackend>,
/// Pipeline that turns a screenshot plus accessibility nodes into a `UiMap`.
pipeline: Arc<dyn PerceptionPipeline>,
/// `UiMap` from the latest successful observation; `None` until one has run.
last_ui_map: Arc<RwLock<Option<UiMap>>>,
}
impl BrowserToolExecutor {
/// Builds an executor around the given browser backend and perception pipeline.
///
/// The UI-map cache starts out empty; it is filled by the observe handler.
pub fn new(backend: Arc<dyn BrowserBackend>, pipeline: Arc<dyn PerceptionPipeline>) -> Self {
    // Nothing has been observed yet, so there is no map to resolve ids against.
    let last_ui_map = Arc::new(RwLock::new(None));
    Self {
        backend,
        pipeline,
        last_ui_map,
    }
}
async fn resolve_element_id(&self, element_id: &str) -> String {
let guard = self.last_ui_map.read().await;
if let Some(ui_map) = guard.as_ref() {
if let Some(element) = ui_map.get_element(element_id) {
if let Some(ref ax_ref) = element.ax_ref {
return ax_ref.clone();
}
}
}
element_id.to_string()
}
pub fn tool_schemas() -> Vec<ToolSchema> {
vec![
ToolSchema {
name: "browse_navigate".to_string(),
description: "Navigate the browser to a URL".to_string(),
parameters: json!({
"type": "object",
"properties": {
"url": { "type": "string", "description": "URL to navigate to" }
},
"required": ["url"]
}),
returns: Some(json!({"type": "object", "properties": {"url": {"type": "string"}}})),
idempotent: false,
cache_ttl_secs: None,
rate_limit: None,
},
ToolSchema {
name: "browse_click".to_string(),
description: "Click on a UI element by accessibility node ID".to_string(),
parameters: json!({
"type": "object",
"properties": {
"element_id": { "type": "string", "description": "Accessibility node ID (e.g. 'el_5')" }
},
"required": ["element_id"]
}),
returns: Some(json!({"type": "object"})),
idempotent: false,
cache_ttl_secs: None,
rate_limit: None,
},
ToolSchema {
name: "browse_type".to_string(),
description: "Type text into a UI element by accessibility node ID".to_string(),
parameters: json!({
"type": "object",
"properties": {
"element_id": { "type": "string", "description": "Accessibility node ID of a text field" },
"text": { "type": "string", "description": "Text to enter" }
},
"required": ["element_id", "text"]
}),
returns: Some(json!({"type": "object"})),
idempotent: false,
cache_ttl_secs: None,
rate_limit: None,
},
ToolSchema {
name: "browse_scroll".to_string(),
description: "Scroll the browser page".to_string(),
parameters: json!({
"type": "object",
"properties": {
"delta_y": { "type": "integer", "description": "Scroll amount (positive = down, negative = up)" }
},
"required": ["delta_y"]
}),
returns: Some(json!({"type": "object"})),
idempotent: false,
cache_ttl_secs: None,
rate_limit: None,
},
ToolSchema {
name: "browse_keypress".to_string(),
description: "Press a key with optional modifiers".to_string(),
parameters: json!({
"type": "object",
"properties": {
"key": { "type": "string", "description": "Key to press (e.g. 'Enter', 'a', 'Tab')" },
"modifiers": {
"type": "array",
"items": { "type": "string", "enum": ["shift", "control", "alt", "meta"] },
"description": "Optional modifier keys to hold during keypress"
}
},
"required": ["key"]
}),
returns: Some(json!({"type": "object", "properties": {"key": {"type": "string"}, "status": {"type": "string"}}})),
idempotent: false,
cache_ttl_secs: None,
rate_limit: None,
},
ToolSchema {
name: "browse_wait".to_string(),
description: "Wait for a browser condition to be met".to_string(),
parameters: json!({
"type": "object",
"properties": {
"condition": { "type": "string", "description": "Condition to wait for: 'page_loaded' or 'url_changed'" },
"timeout_ms": { "type": "number", "description": "Timeout in milliseconds (default: 5000)" }
},
"required": ["condition"]
}),
returns: Some(json!({"type": "object", "properties": {"condition": {"type": "string"}, "met": {"type": "boolean"}}})),
idempotent: true,
cache_ttl_secs: None,
rate_limit: None,
},
ToolSchema {
name: "browse_observe".to_string(),
description: "Observe the current browser state: take screenshot, extract accessibility tree, produce UiMap".to_string(),
parameters: json!({
"type": "object",
"properties": {
"include_screenshot": { "type": "boolean", "description": "Include base64 screenshot inline (default: false, returns file path instead)" }
}
}),
returns: Some(json!({
"type": "object",
"properties": {
"url": {"type": "string"},
"title": {"type": "string"},
"ui_map": {"type": "string"},
"screenshot_path": {"type": "string"},
"screenshot_base64": {"type": "string", "description": "Only present if include_screenshot=true"}
}
})),
idempotent: true,
cache_ttl_secs: None,
rate_limit: None,
},
]
}
async fn handle_navigate(&self, params: &Value) -> Result<Value, String> {
let url = params
.get("url")
.and_then(|v| v.as_str())
.ok_or("Missing required parameter: url")?;
self.backend
.navigate(url)
.await
.map_err(|e| e.to_string())?;
Ok(json!({"url": url, "status": "navigated"}))
}
async fn handle_click(&self, params: &Value) -> Result<Value, String> {
let element_id = params
.get("element_id")
.and_then(|v| v.as_str())
.ok_or("Missing required parameter: element_id")?;
let resolved_id = self.resolve_element_id(element_id).await;
self.backend
.click_element(&resolved_id)
.await
.map_err(|e| e.to_string())?;
Ok(json!({"element_id": element_id, "resolved_id": resolved_id, "status": "clicked"}))
}
/// Handles `browse_type`: types `params["text"]` into `params["element_id"]`.
///
/// The element id is passed through `resolve_element_id` before the backend is
/// called; the response echoes the requested id, the resolved id and the text.
///
/// # Errors
/// Returns an error string when either parameter is absent or not a string
/// (`element_id` is checked first), or when `type_into_element` fails.
async fn handle_type(&self, params: &Value) -> Result<Value, String> {
    // Indexing a `Value` yields `Null` for a missing key, so `as_str` is
    // `None` both when the key is absent and when it is not a string.
    let requested = params["element_id"]
        .as_str()
        .ok_or("Missing required parameter: element_id")?;
    let input = params["text"]
        .as_str()
        .ok_or("Missing required parameter: text")?;
    let target = self.resolve_element_id(requested).await;
    let outcome = self.backend.type_into_element(&target, input).await;
    outcome.map_err(|err| err.to_string())?;
    Ok(json!({
        "element_id": requested,
        "resolved_id": target,
        "text": input,
        "status": "typed"
    }))
}
/// Handles `browse_scroll`: scrolls the page by `params["delta_y"]`.
///
/// # Errors
/// Returns an error string when `delta_y` is absent or not an integer, when
/// it does not fit in an `i32`, or when the backend's `inject_scroll` fails.
async fn handle_scroll(&self, params: &Value) -> Result<Value, String> {
    // Function-scope import so this also builds on editions whose prelude
    // does not include `TryFrom`.
    use std::convert::TryFrom;

    let raw = params
        .get("delta_y")
        .and_then(|v| v.as_i64())
        .ok_or("Missing required parameter: delta_y")?;
    // A plain `as i32` cast would wrap silently (e.g. 4294967296 -> 0) and
    // scroll by an unrelated amount, so out-of-range values are rejected.
    let delta_y = i32::try_from(raw)
        .map_err(|_| format!("Parameter delta_y out of range: {raw}"))?;
    self.backend
        .inject_scroll(delta_y)
        .await
        .map_err(|e| e.to_string())?;
    Ok(json!({"delta_y": delta_y, "status": "scrolled"}))
}
/// Handles `browse_keypress`: presses `params["key"]` with optional modifiers.
///
/// `modifiers`, when present as an array, may contain `"shift"`, `"control"`,
/// `"alt"` and `"meta"`. A missing or non-array `modifiers` value means no
/// modifiers are held.
///
/// # Errors
/// Returns an error string when `key` is absent or not a string, when any
/// modifier entry is not one of the four supported names, or when the
/// backend's `inject_keypress` call fails.
async fn handle_keypress(&self, params: &Value) -> Result<Value, String> {
    let key = params
        .get("key")
        .and_then(|v| v.as_str())
        .ok_or("Missing required parameter: key")?;
    let modifiers: Vec<Modifier> = match params.get("modifiers").and_then(|v| v.as_array()) {
        Some(entries) => entries
            .iter()
            .map(|entry| match entry.as_str() {
                Some("shift") => Ok(Modifier::Shift),
                Some("control") => Ok(Modifier::Control),
                Some("alt") => Ok(Modifier::Alt),
                Some("meta") => Ok(Modifier::Meta),
                // Dropping an unrecognised modifier would send a different
                // key combination than requested (e.g. a bare "a" instead of
                // a shortcut) while still reporting success, so fail loudly.
                _ => Err(format!("Unknown modifier: {entry}")),
            })
            .collect::<Result<Vec<_>, String>>()?,
        None => Vec::new(),
    };
    self.backend
        .inject_keypress(key, &modifiers)
        .await
        .map_err(|e| e.to_string())?;
    Ok(json!({"key": key, "status": "pressed"}))
}
/// Handles `browse_wait`: waits until a condition holds or the timeout elapses.
///
/// `params["condition"]` takes one of these forms:
/// - `page_loaded`
/// - `url_changed`
/// - `a11y_contains_text:<text>`
/// - `element_with_name:<name>` or `element_with_name:<name>@<role>`
///   (split at the first `@`)
///
/// `params["timeout_ms"]` is optional. Any non-negative JSON number is
/// accepted, with fractional milliseconds truncated. A missing, negative or
/// non-numeric value falls back to 5000 ms.
///
/// Returns `{"condition": <input string>, "met": <backend result>}`.
///
/// # Errors
/// Returns an error string when `condition` is absent, not a string, or not
/// one of the forms above, or when the backend's `wait_until` call fails.
async fn handle_wait(&self, params: &Value) -> Result<Value, String> {
    const DEFAULT_TIMEOUT_MS: u64 = 5000;

    let condition_str = params
        .get("condition")
        .and_then(|v| v.as_str())
        .ok_or("Missing required parameter: condition")?;
    // The schema declares `timeout_ms` as a JSON "number". `as_u64` alone
    // returns `None` for float-encoded values such as `10000.0`, which would
    // silently replace the caller's timeout with the default.
    let timeout_ms = params
        .get("timeout_ms")
        .and_then(|v| {
            v.as_u64().or_else(|| {
                v.as_f64()
                    .filter(|ms| *ms >= 0.0)
                    // Float-to-int `as` truncates the fraction and saturates
                    // at `u64::MAX`, which is acceptable for a timeout.
                    .map(|ms| ms as u64)
            })
        })
        .unwrap_or(DEFAULT_TIMEOUT_MS);
    let condition = if condition_str == "page_loaded" {
        WaitCondition::PageLoaded
    } else if condition_str == "url_changed" {
        WaitCondition::UrlChanged
    } else if let Some(text) = condition_str.strip_prefix("a11y_contains_text:") {
        WaitCondition::A11yContainsText {
            text: text.to_string(),
        }
    } else if let Some(rest) = condition_str.strip_prefix("element_with_name:") {
        // An optional role follows the first '@'.
        let (name_contains, role) = match rest.split_once('@') {
            Some((name, role)) => (name.to_string(), Some(role.to_string())),
            None => (rest.to_string(), None),
        };
        WaitCondition::ElementWithName {
            name_contains,
            role,
        }
    } else {
        return Err(format!("Unknown wait condition: {condition_str}"));
    };
    let met = self
        .backend
        .wait_until(&condition, timeout_ms)
        .await
        .map_err(|e| e.to_string())?;
    Ok(json!({"condition": condition_str, "met": met}))
}
/// Handles `browse_observe`: captures the page and builds a fresh `UiMap`.
///
/// Steps, in order:
/// 1. Fetch screenshot, accessibility tree, URL, title and viewport from the
///    backend.
/// 2. Run the perception pipeline over them to produce a `UiMap`.
/// 3. Cache that map in `last_ui_map` for later element-id resolution.
/// 4. Write the screenshot to `<temp dir>/car-browser-screenshots/<uuid>.png`.
///
/// Returns `url`, `title`, `ui_map` (the map's text summary),
/// `screenshot_path`, `element_count` and `viewport` (`width`, `height`).
///
/// NOTE(review): `_params` is not read, so the `include_screenshot` option
/// advertised by the tool schema has no effect and `screenshot_base64` is
/// never returned.
///
/// # Errors
/// Returns the stringified error of whichever backend call, pipeline run or
/// filesystem operation fails first. The cache is updated before the
/// screenshot is written, so a filesystem failure still leaves the new map
/// cached.
async fn handle_observe(&self, _params: &Value) -> Result<Value, String> {
    let screenshot = self
        .backend
        .capture_screenshot()
        .await
        .map_err(|e| e.to_string())?;
    let a11y_nodes = self
        .backend
        .get_accessibility_tree()
        .await
        .map_err(|e| e.to_string())?;
    let url = self.backend.get_current_url().map_err(|e| e.to_string())?;
    let title = self
        .backend
        .get_page_title()
        .await
        .map_err(|e| e.to_string())?;
    let viewport = self.backend.get_viewport().map_err(|e| e.to_string())?;
    let ui_map = self
        .pipeline
        .perceive(&screenshot, &a11y_nodes, &url, viewport)
        .await
        .map_err(|e| e.to_string())?;

    // Extract everything the response needs up front so the map can be moved
    // into the cache instead of being cloned.
    let ui_map_text = ui_map.format_summary();
    let element_count = ui_map.elements.len();
    // The write guard is a temporary and is released at the end of this
    // statement.
    *self.last_ui_map.write().await = Some(ui_map);

    // NOTE(review): these are synchronous `std::fs` calls inside an async fn;
    // consider an async or blocking-pool alternative if the runtime offers one.
    let screenshot_path = {
        let dir = std::env::temp_dir().join("car-browser-screenshots");
        // Surface directory-creation failures directly; ignoring them only
        // defers the failure to a less informative error from the write below.
        std::fs::create_dir_all(&dir).map_err(|e| {
            format!(
                "Failed to create screenshot directory {}: {e}",
                dir.display()
            )
        })?;
        let path = dir.join(format!("{}.png", uuid::Uuid::new_v4()));
        std::fs::write(&path, &screenshot).map_err(|e| e.to_string())?;
        path.to_string_lossy().to_string()
    };
    let result = json!({
        "url": url,
        "title": title,
        "ui_map": ui_map_text,
        "screenshot_path": screenshot_path,
        "element_count": element_count,
        "viewport": {
            "width": viewport.width,
            "height": viewport.height,
        }
    });
    Ok(result)
}
}
#[async_trait]
impl ToolExecutor for BrowserToolExecutor {
    /// Routes a tool invocation to the matching `browse_*` handler.
    ///
    /// `params` is forwarded to the handler untouched and the handler's
    /// result is returned as-is. A tool name that matches none of the arms
    /// below yields an `Unknown browser tool` error naming the tool.
    async fn execute(&self, tool: &str, params: &Value) -> Result<Value, String> {
        let outcome = match tool {
            "browse_navigate" => self.handle_navigate(params).await,
            "browse_click" => self.handle_click(params).await,
            "browse_type" => self.handle_type(params).await,
            "browse_scroll" => self.handle_scroll(params).await,
            "browse_keypress" => self.handle_keypress(params).await,
            "browse_wait" => self.handle_wait(params).await,
            "browse_observe" => self.handle_observe(params).await,
            unknown => Err(format!("Unknown browser tool: {unknown}")),
        };
        outcome
    }
}