1use async_trait::async_trait;
2use base64::Engine;
3use serde_json::json;
4use std::io::Write;
5use std::process::Stdio;
6
7use crate::event::{Block, RiskLevel};
8use crate::tools::{Tool, ToolCtx, ToolResult};
9
10const PLAYWRIGHT_DRIVER: &str = include_str!("../../scripts/playwright-driver.mjs");
11
12pub struct BrowserTool;
21
22#[async_trait]
23impl Tool for BrowserTool {
24 fn name(&self) -> &str {
25 "browser"
26 }
27
28 fn description(&self) -> &str {
29 "Control a Playwright headless browser: navigate, screenshot, click, type, extract text, or evaluate JavaScript"
30 }
31
32 fn schema(&self) -> serde_json::Value {
33 browser_schema(&[
34 "navigate",
35 "screenshot",
36 "get_text",
37 "extract",
38 "click",
39 "type",
40 "press",
41 "evaluate",
42 ])
43 }
44
45 fn risk(&self) -> RiskLevel {
46 RiskLevel::Network
47 }
48
49 async fn call(&self, args: serde_json::Value, ctx: &ToolCtx) -> anyhow::Result<ToolResult> {
50 run_playwright(args, ctx, false).await
51 }
52}
53
54pub struct ComputerTool;
61
62#[async_trait]
63impl Tool for ComputerTool {
64 fn name(&self) -> &str {
65 "computer"
66 }
67
68 fn description(&self) -> &str {
69 "Computer-use actions through Playwright Chromium: screenshot, click, type, and key press, gated as sandboxed exec"
70 }
71
72 fn schema(&self) -> serde_json::Value {
73 browser_schema(&["screenshot", "click", "type", "press"])
74 }
75
76 fn risk(&self) -> RiskLevel {
77 RiskLevel::Exec
78 }
79
80 async fn call(&self, args: serde_json::Value, ctx: &ToolCtx) -> anyhow::Result<ToolResult> {
81 run_playwright(args, ctx, true).await
82 }
83}
84
85fn browser_schema(actions: &[&str]) -> serde_json::Value {
86 json!({
87 "type": "object",
88 "properties": {
89 "action": { "type": "string", "enum": actions },
90 "url": { "type": "string", "description": "URL to open before the action; defaults to about:blank" },
91 "selector": { "type": "string", "description": "CSS selector for click/type/extract or element screenshot" },
92 "x": { "type": "number", "description": "Viewport X coordinate for computer click/type actions" },
93 "y": { "type": "number", "description": "Viewport Y coordinate for computer click/type actions" },
94 "button": { "type": "string", "enum": ["left", "right", "middle"], "description": "Mouse button for coordinate click actions" },
95 "click_count": { "type": "integer", "description": "Number of clicks for coordinate click actions" },
96 "text": { "type": "string", "description": "Text for type/fill actions" },
97 "key": { "type": "string", "description": "Keyboard key or chord for press actions, e.g. Enter or Control+A" },
98 "js": { "type": "string", "description": "JavaScript expression/function body for evaluate" },
99 "timeout_ms": { "type": "integer", "description": "Timeout in milliseconds, default 30000" },
100 "wait_until": { "type": "string", "description": "Playwright navigation wait state, default networkidle" },
101 "headless": { "type": "boolean", "description": "Run Chromium headless unless false" },
102 "session_id": { "type": "string", "description": "Optional persistent browser session id for multi-step browser/computer-use" },
103 "full_page": { "type": "boolean", "description": "For screenshots, capture the full page unless false" },
104 "viewport": {
105 "type": "object",
106 "properties": {
107 "width": { "type": "integer" },
108 "height": { "type": "integer" },
109 "deviceScaleFactor": { "type": "number" }
110 }
111 }
112 },
113 "required": ["action"]
114 })
115}
116
117async fn run_playwright(
118 mut args: serde_json::Value,
119 ctx: &ToolCtx,
120 require_computer_action: bool,
121) -> anyhow::Result<ToolResult> {
122 let action = args["action"].as_str().unwrap_or("navigate").to_string();
123 if require_computer_action
124 && !matches!(action.as_str(), "screenshot" | "click" | "type" | "press")
125 {
126 return Ok(ToolResult::error(format!(
127 "computer only supports screenshot, click, type, and press (got {})",
128 action
129 )));
130 }
131
132 if args.get("url").and_then(|v| v.as_str()).is_none()
133 && args.get("session_id").and_then(|v| v.as_str()).is_none()
134 {
135 args["url"] = json!("about:blank");
136 }
137
138 let driver_path = materialize_driver()?;
139 let workspace_root = ctx.workspace_root.clone();
140 let output = tokio::task::spawn_blocking(move || {
141 invoke_node_driver(&driver_path, &args, &workspace_root)
142 })
143 .await
144 .map_err(|e| anyhow::anyhow!("browser task join error: {}", e))??;
145
146 parse_driver_output(&action, &output)
147}
148
149fn materialize_driver() -> anyhow::Result<std::path::PathBuf> {
150 let dir = std::env::temp_dir().join("sparrow-playwright");
151 std::fs::create_dir_all(&dir)?;
152 let path = dir.join(format!("driver-{}.mjs", env!("CARGO_PKG_VERSION")));
153 if !path.exists() || std::fs::read_to_string(&path).ok().as_deref() != Some(PLAYWRIGHT_DRIVER) {
154 std::fs::write(&path, PLAYWRIGHT_DRIVER)?;
155 }
156 Ok(path)
157}
158
159fn invoke_node_driver(
160 driver_path: &std::path::Path,
161 args: &serde_json::Value,
162 workspace_root: &std::path::Path,
163) -> anyhow::Result<String> {
164 let mut child = build_command(driver_path, workspace_root)
165 .stdin(Stdio::piped())
166 .stdout(Stdio::piped())
167 .stderr(Stdio::piped())
168 .spawn()
169 .map_err(|e| {
170 anyhow::anyhow!(
171 "failed to launch Playwright driver via Node.js: {}. Install Node.js, then run `npm install` and `npx playwright install chromium`.",
172 e
173 )
174 })?;
175
176 if let Some(stdin) = child.stdin.as_mut() {
177 stdin.write_all(serde_json::to_string(args)?.as_bytes())?;
178 }
179
180 let output = child.wait_with_output()?;
181 if !output.status.success() {
182 anyhow::bail!(
183 "Playwright driver exited {}: {}",
184 output.status,
185 String::from_utf8_lossy(&output.stderr)
186 );
187 }
188 let stdout = String::from_utf8(output.stdout)?;
189 if stdout.trim().is_empty() {
190 anyhow::bail!(
191 "Playwright driver returned no output: {}",
192 String::from_utf8_lossy(&output.stderr)
193 );
194 }
195 Ok(stdout)
196}
197
198fn build_command(
199 driver_path: &std::path::Path,
200 workspace_root: &std::path::Path,
201) -> std::process::Command {
202 #[cfg(target_os = "linux")]
203 {
204 if std::env::var("SPARROW_BROWSER_BWRAP").ok().as_deref() != Some("0") && which("bwrap") {
205 let mut cmd = std::process::Command::new("bwrap");
206 cmd.arg("--die-with-parent")
207 .arg("--unshare-pid")
208 .arg("--proc")
209 .arg("/proc")
210 .arg("--dev")
211 .arg("/dev")
212 .arg("--tmpfs")
213 .arg("/tmp");
214 for path in ["/usr", "/bin", "/lib", "/lib64", "/etc"] {
215 add_ro_bind_if_exists(&mut cmd, path);
216 }
217 if let Some(home) = std::env::var_os("HOME") {
218 let cache = std::path::PathBuf::from(home)
219 .join(".cache")
220 .join("ms-playwright");
221 add_bind_if_exists(&mut cmd, &cache);
222 }
223 add_bind_if_exists(&mut cmd, &std::env::temp_dir());
224 cmd.arg("--bind")
225 .arg(workspace_root)
226 .arg(workspace_root)
227 .arg("--chdir")
228 .arg(workspace_root)
229 .arg("node")
230 .arg(driver_path);
231 return cmd;
232 }
233 }
234
235 let mut cmd = std::process::Command::new("node");
236 cmd.arg(driver_path).current_dir(workspace_root);
237 cmd
238}
239
240fn parse_driver_output(action: &str, stdout: &str) -> anyhow::Result<ToolResult> {
241 let value: serde_json::Value = serde_json::from_str(stdout.trim())?;
242 if value.get("ok").and_then(|v| v.as_bool()) != Some(true) {
243 let error = value
244 .get("error")
245 .and_then(|v| v.as_str())
246 .unwrap_or("Playwright driver failed");
247 let detail = value.get("detail").and_then(|v| v.as_str()).unwrap_or("");
248 return Ok(ToolResult::error(
249 format!("{}{}", error, if detail.is_empty() { "" } else { ": " }) + detail,
250 ));
251 }
252
253 if let Some(image) = value.get("image_base64").and_then(|v| v.as_str()) {
254 let data = base64::engine::general_purpose::STANDARD.decode(image)?;
255 return Ok(ToolResult::ok(vec![Block::Image {
256 data,
257 mime: value
258 .get("mime")
259 .and_then(|v| v.as_str())
260 .unwrap_or("image/png")
261 .to_string(),
262 }]));
263 }
264
265 if let Some(text) = value.get("text").and_then(|v| v.as_str()) {
266 return Ok(ToolResult::text(text.to_string()));
267 }
268
269 if let Some(result) = value.get("result") {
270 return Ok(ToolResult::text(result.to_string()));
271 }
272
273 let title = value.get("title").and_then(|v| v.as_str()).unwrap_or("");
274 let url = value.get("url").and_then(|v| v.as_str()).unwrap_or("");
275 Ok(ToolResult::text(format!(
276 "{} ok{}{}",
277 action,
278 if url.is_empty() { "" } else { " · " },
279 if title.is_empty() { url } else { title }
280 )))
281}
282
/// Reports whether the shell can resolve `cmd` via `command -v`.
///
/// Returns `false` when `sh` cannot be started or the lookup fails.
#[cfg(target_os = "linux")]
fn which(cmd: &str) -> bool {
    // Hand the name over as positional parameter `$1` rather than splicing it
    // into the script, so shell metacharacters in `cmd` are never interpreted.
    std::process::Command::new("sh")
        .arg("-c")
        .arg("command -v \"$1\"")
        .arg("sh") // becomes $0 of the inline script
        .arg(cmd) // becomes $1
        .output()
        .map(|o| o.status.success())
        .unwrap_or(false)
}
292
/// Appends `--ro-bind <path> <path>` to `cmd` when `path` exists on the host;
/// otherwise leaves `cmd` untouched.
#[cfg(target_os = "linux")]
fn add_ro_bind_if_exists(cmd: &mut std::process::Command, path: &str) {
    let target = std::path::Path::new(path);
    if !target.exists() {
        return;
    }
    cmd.args([
        std::ffi::OsStr::new("--ro-bind"),
        target.as_os_str(),
        target.as_os_str(),
    ]);
}
300
/// Appends `--bind <path> <path>` to `cmd` when `path` exists on the host;
/// otherwise leaves `cmd` untouched.
#[cfg(target_os = "linux")]
fn add_bind_if_exists(cmd: &mut std::process::Command, path: &std::path::Path) {
    if !path.exists() {
        return;
    }
    let target = path.as_os_str();
    cmd.args([std::ffi::OsStr::new("--bind"), target, target]);
}
307
#[cfg(test)]
mod tests {
    use super::*;

    /// A successful payload carrying `image_base64` becomes one image block
    /// with the decoded bytes and the reported MIME type.
    #[test]
    fn parses_screenshot_payload_as_image_block() {
        let encoded = base64::engine::general_purpose::STANDARD.encode([1_u8, 2, 3]);
        let payload = json!({
            "ok": true,
            "mime": "image/png",
            "image_base64": encoded,
        })
        .to_string();

        let result = parse_driver_output("screenshot", &payload).unwrap();

        assert!(!result.is_error);
        match result.content.as_slice() {
            [Block::Image { mime, data }] => {
                assert!(mime == "image/png");
                assert!(data == &vec![1, 2, 3]);
            }
            _ => panic!("expected exactly one image block"),
        }
    }

    /// `navigate` is outside the computer-use subset, so the call yields an
    /// error result.
    #[test]
    fn computer_rejects_non_computer_actions_before_driver_launch() {
        let ctx = ToolCtx {
            workspace_root: std::env::current_dir().unwrap(),
            run_id: crate::event::RunId("test".into()),
        };
        let args = json!({"action": "navigate", "url": "https://example.com"});

        let runtime = tokio::runtime::Runtime::new().unwrap();
        let result = runtime
            .block_on(async { ComputerTool.call(args, &ctx).await })
            .unwrap();

        assert!(result.is_error);
    }

    /// The computer schema lists `press` and exposes the coordinate, session
    /// and key properties.
    #[test]
    fn computer_schema_exposes_coordinate_session_and_press_controls() {
        let schema = ComputerTool.schema();
        let properties = &schema["properties"];

        let actions = properties["action"]["enum"]
            .as_array()
            .expect("action enum");
        assert!(actions.iter().any(|v| v == "press"));

        for key in &["x", "y", "session_id", "key"] {
            assert!(properties.get(*key).is_some());
        }
    }

    /// The embedded driver resolves Playwright relative to the workspace
    /// instead of importing it by bare module name.
    #[test]
    fn embedded_driver_resolves_playwright_from_workspace() {
        for marker in &["createRequire", "process.cwd()", "SPARROW_PLAYWRIGHT_ROOT"] {
            assert!(PLAYWRIGHT_DRIVER.contains(*marker));
        }
        assert!(!PLAYWRIGHT_DRIVER.contains("from \"playwright\""));
    }
}