Skip to main content

sparrow/tools/
browser_sandbox.rs

1use async_trait::async_trait;
2use base64::Engine;
3use serde_json::json;
4use std::io::Write;
5use std::process::Stdio;
6
7use crate::event::{Block, RiskLevel};
8use crate::tools::{Tool, ToolCtx, ToolResult};
9
/// Source text of the Node.js Playwright driver script, embedded into the
/// binary at compile time via `include_str!`.
const PLAYWRIGHT_DRIVER: &str = include_str!("../../scripts/playwright-driver.mjs");
11
12/// Browser automation through a real Playwright Chromium runtime.
13///
14/// The driver is embedded into the binary and materialized into a temporary
15/// `.mjs` file per call, so installed Sparrow binaries do not depend on a repo
16/// checkout. The host must provide Node.js plus the `playwright` package and a
17/// Chromium browser (`npm install && npx playwright install chromium`). The
18/// driver resolves Playwright from the workspace root even though the embedded
19/// script itself is materialized in the OS temp directory.
20pub struct BrowserTool;
21
22#[async_trait]
23impl Tool for BrowserTool {
24    fn name(&self) -> &str {
25        "browser"
26    }
27
28    fn description(&self) -> &str {
29        "Control a Playwright headless browser: navigate, screenshot, click, type, extract text, or evaluate JavaScript"
30    }
31
32    fn schema(&self) -> serde_json::Value {
33        browser_schema(&[
34            "navigate",
35            "screenshot",
36            "get_text",
37            "extract",
38            "click",
39            "type",
40            "press",
41            "evaluate",
42        ])
43    }
44
45    fn risk(&self) -> RiskLevel {
46        RiskLevel::Network
47    }
48
49    async fn call(&self, args: serde_json::Value, ctx: &ToolCtx) -> anyhow::Result<ToolResult> {
50        run_playwright(args, ctx, false).await
51    }
52}
53
54/// Computer-use primitive focused on screenshot/click/type/press.
55///
56/// This is intentionally separate from `browser`: it is classified as Exec so
57/// supervised/autonomous policy gates can treat UI-driving actions as stronger
58/// than passive web reads. On Linux hardened mode, the launched Node process is
59/// wrapped with `bwrap` when available.
60pub struct ComputerTool;
61
62#[async_trait]
63impl Tool for ComputerTool {
64    fn name(&self) -> &str {
65        "computer"
66    }
67
68    fn description(&self) -> &str {
69        "Computer-use actions through Playwright Chromium: screenshot, click, type, and key press, gated as sandboxed exec"
70    }
71
72    fn schema(&self) -> serde_json::Value {
73        browser_schema(&["screenshot", "click", "type", "press"])
74    }
75
76    fn risk(&self) -> RiskLevel {
77        RiskLevel::Exec
78    }
79
80    async fn call(&self, args: serde_json::Value, ctx: &ToolCtx) -> anyhow::Result<ToolResult> {
81        run_playwright(args, ctx, true).await
82    }
83}
84
85fn browser_schema(actions: &[&str]) -> serde_json::Value {
86    json!({
87        "type": "object",
88        "properties": {
89            "action": { "type": "string", "enum": actions },
90            "url": { "type": "string", "description": "URL to open before the action; defaults to about:blank" },
91            "selector": { "type": "string", "description": "CSS selector for click/type/extract or element screenshot" },
92            "x": { "type": "number", "description": "Viewport X coordinate for computer click/type actions" },
93            "y": { "type": "number", "description": "Viewport Y coordinate for computer click/type actions" },
94            "button": { "type": "string", "enum": ["left", "right", "middle"], "description": "Mouse button for coordinate click actions" },
95            "click_count": { "type": "integer", "description": "Number of clicks for coordinate click actions" },
96            "text": { "type": "string", "description": "Text for type/fill actions" },
97            "key": { "type": "string", "description": "Keyboard key or chord for press actions, e.g. Enter or Control+A" },
98            "js": { "type": "string", "description": "JavaScript expression/function body for evaluate" },
99            "timeout_ms": { "type": "integer", "description": "Timeout in milliseconds, default 30000" },
100            "wait_until": { "type": "string", "description": "Playwright navigation wait state, default networkidle" },
101            "headless": { "type": "boolean", "description": "Run Chromium headless unless false" },
102            "session_id": { "type": "string", "description": "Optional persistent browser session id for multi-step browser/computer-use" },
103            "full_page": { "type": "boolean", "description": "For screenshots, capture the full page unless false" },
104            "viewport": {
105                "type": "object",
106                "properties": {
107                    "width": { "type": "integer" },
108                    "height": { "type": "integer" },
109                    "deviceScaleFactor": { "type": "number" }
110                }
111            }
112        },
113        "required": ["action"]
114    })
115}
116
117async fn run_playwright(
118    mut args: serde_json::Value,
119    ctx: &ToolCtx,
120    require_computer_action: bool,
121) -> anyhow::Result<ToolResult> {
122    let action = args["action"].as_str().unwrap_or("navigate").to_string();
123    if require_computer_action
124        && !matches!(action.as_str(), "screenshot" | "click" | "type" | "press")
125    {
126        return Ok(ToolResult::error(format!(
127            "computer only supports screenshot, click, type, and press (got {})",
128            action
129        )));
130    }
131
132    if args.get("url").and_then(|v| v.as_str()).is_none()
133        && args.get("session_id").and_then(|v| v.as_str()).is_none()
134    {
135        args["url"] = json!("about:blank");
136    }
137
138    let driver_path = materialize_driver()?;
139    let workspace_root = ctx.workspace_root.clone();
140    let output = tokio::task::spawn_blocking(move || {
141        invoke_node_driver(&driver_path, &args, &workspace_root)
142    })
143    .await
144    .map_err(|e| anyhow::anyhow!("browser task join error: {}", e))??;
145
146    parse_driver_output(&action, &output)
147}
148
149fn materialize_driver() -> anyhow::Result<std::path::PathBuf> {
150    let dir = std::env::temp_dir().join("sparrow-playwright");
151    std::fs::create_dir_all(&dir)?;
152    let path = dir.join(format!("driver-{}.mjs", env!("CARGO_PKG_VERSION")));
153    if !path.exists() || std::fs::read_to_string(&path).ok().as_deref() != Some(PLAYWRIGHT_DRIVER) {
154        std::fs::write(&path, PLAYWRIGHT_DRIVER)?;
155    }
156    Ok(path)
157}
158
159fn invoke_node_driver(
160    driver_path: &std::path::Path,
161    args: &serde_json::Value,
162    workspace_root: &std::path::Path,
163) -> anyhow::Result<String> {
164    let mut child = build_command(driver_path, workspace_root)
165        .stdin(Stdio::piped())
166        .stdout(Stdio::piped())
167        .stderr(Stdio::piped())
168        .spawn()
169        .map_err(|e| {
170            anyhow::anyhow!(
171                "failed to launch Playwright driver via Node.js: {}. Install Node.js, then run `npm install` and `npx playwright install chromium`.",
172                e
173            )
174        })?;
175
176    if let Some(stdin) = child.stdin.as_mut() {
177        stdin.write_all(serde_json::to_string(args)?.as_bytes())?;
178    }
179
180    let output = child.wait_with_output()?;
181    if !output.status.success() {
182        anyhow::bail!(
183            "Playwright driver exited {}: {}",
184            output.status,
185            String::from_utf8_lossy(&output.stderr)
186        );
187    }
188    let stdout = String::from_utf8(output.stdout)?;
189    if stdout.trim().is_empty() {
190        anyhow::bail!(
191            "Playwright driver returned no output: {}",
192            String::from_utf8_lossy(&output.stderr)
193        );
194    }
195    Ok(stdout)
196}
197
198fn build_command(
199    driver_path: &std::path::Path,
200    workspace_root: &std::path::Path,
201) -> std::process::Command {
202    #[cfg(target_os = "linux")]
203    {
204        if std::env::var("SPARROW_BROWSER_BWRAP").ok().as_deref() != Some("0") && which("bwrap") {
205            let mut cmd = std::process::Command::new("bwrap");
206            cmd.arg("--die-with-parent")
207                .arg("--unshare-pid")
208                .arg("--proc")
209                .arg("/proc")
210                .arg("--dev")
211                .arg("/dev")
212                .arg("--tmpfs")
213                .arg("/tmp");
214            for path in ["/usr", "/bin", "/lib", "/lib64", "/etc"] {
215                add_ro_bind_if_exists(&mut cmd, path);
216            }
217            if let Some(home) = std::env::var_os("HOME") {
218                let cache = std::path::PathBuf::from(home)
219                    .join(".cache")
220                    .join("ms-playwright");
221                add_bind_if_exists(&mut cmd, &cache);
222            }
223            add_bind_if_exists(&mut cmd, &std::env::temp_dir());
224            cmd.arg("--bind")
225                .arg(workspace_root)
226                .arg(workspace_root)
227                .arg("--chdir")
228                .arg(workspace_root)
229                .arg("node")
230                .arg(driver_path);
231            return cmd;
232        }
233    }
234
235    let mut cmd = std::process::Command::new("node");
236    cmd.arg(driver_path).current_dir(workspace_root);
237    cmd
238}
239
240fn parse_driver_output(action: &str, stdout: &str) -> anyhow::Result<ToolResult> {
241    let value: serde_json::Value = serde_json::from_str(stdout.trim())?;
242    if value.get("ok").and_then(|v| v.as_bool()) != Some(true) {
243        let error = value
244            .get("error")
245            .and_then(|v| v.as_str())
246            .unwrap_or("Playwright driver failed");
247        let detail = value.get("detail").and_then(|v| v.as_str()).unwrap_or("");
248        return Ok(ToolResult::error(
249            format!("{}{}", error, if detail.is_empty() { "" } else { ": " }) + detail,
250        ));
251    }
252
253    if let Some(image) = value.get("image_base64").and_then(|v| v.as_str()) {
254        let data = base64::engine::general_purpose::STANDARD.decode(image)?;
255        return Ok(ToolResult::ok(vec![Block::Image {
256            data,
257            mime: value
258                .get("mime")
259                .and_then(|v| v.as_str())
260                .unwrap_or("image/png")
261                .to_string(),
262        }]));
263    }
264
265    if let Some(text) = value.get("text").and_then(|v| v.as_str()) {
266        return Ok(ToolResult::text(text.to_string()));
267    }
268
269    if let Some(result) = value.get("result") {
270        return Ok(ToolResult::text(result.to_string()));
271    }
272
273    let title = value.get("title").and_then(|v| v.as_str()).unwrap_or("");
274    let url = value.get("url").and_then(|v| v.as_str()).unwrap_or("");
275    Ok(ToolResult::text(format!(
276        "{} ok{}{}",
277        action,
278        if url.is_empty() { "" } else { " · " },
279        if title.is_empty() { url } else { title }
280    )))
281}
282
/// Reports whether `cmd` names an executable file.
///
/// A `cmd` containing `/` is checked directly as a path; otherwise each
/// directory in `PATH` is searched (falling back to `/bin:/usr/bin` when
/// `PATH` is unset). No shell is involved, so `cmd` is never interpreted as
/// shell syntax and shell builtins are not reported as found.
#[cfg(target_os = "linux")]
fn which(cmd: &str) -> bool {
    /// True when `candidate` is a regular file with at least one execute bit set.
    fn is_executable_file(candidate: &std::path::Path) -> bool {
        std::fs::metadata(candidate)
            .map(|meta| {
                let mode = std::os::unix::fs::PermissionsExt::mode(&meta.permissions());
                meta.is_file() && (mode & 0o111) != 0
            })
            .unwrap_or(false)
    }

    if cmd.is_empty() {
        return false;
    }
    if cmd.contains('/') {
        return is_executable_file(std::path::Path::new(cmd));
    }

    let search_path = std::env::var_os("PATH")
        .unwrap_or_else(|| std::ffi::OsString::from("/bin:/usr/bin"));
    std::env::split_paths(&search_path).any(|dir| is_executable_file(&dir.join(cmd)))
}
292
/// Appends `--ro-bind <path> <path>` to `cmd`, but only when `path` exists on
/// the host; otherwise `cmd` is left untouched.
#[cfg(target_os = "linux")]
fn add_ro_bind_if_exists(cmd: &mut std::process::Command, path: &str) {
    if !std::path::Path::new(path).exists() {
        return;
    }
    cmd.args(["--ro-bind", path, path]);
}
300
/// Appends `--bind <path> <path>` to `cmd`, but only when `path` exists on the
/// host; otherwise `cmd` is left untouched.
#[cfg(target_os = "linux")]
fn add_bind_if_exists(cmd: &mut std::process::Command, path: &std::path::Path) {
    if !path.exists() {
        return;
    }
    cmd.arg("--bind");
    cmd.args([path, path]);
}
307
#[cfg(test)]
mod tests {
    use super::*;

    /// A reply carrying `image_base64` must come back as exactly one decoded
    /// image block with the advertised MIME type.
    #[test]
    fn parses_screenshot_payload_as_image_block() {
        let encoded = base64::engine::general_purpose::STANDARD.encode([1_u8, 2, 3]);
        let payload = json!({
            "ok": true,
            "mime": "image/png",
            "image_base64": encoded,
        })
        .to_string();

        let parsed = parse_driver_output("screenshot", &payload).unwrap();

        assert!(!parsed.is_error);
        match parsed.content.as_slice() {
            [Block::Image { mime, data }] => {
                assert!(mime == "image/png");
                assert!(data == &vec![1, 2, 3]);
            }
            _ => panic!("expected exactly one image block"),
        }
    }

    /// `navigate` is not a computer-use action, so the call must end in a tool
    /// error instead of reaching the driver.
    #[test]
    fn computer_rejects_non_computer_actions_before_driver_launch() {
        let ctx = ToolCtx {
            workspace_root: std::env::current_dir().unwrap(),
            run_id: crate::event::RunId("test".into()),
        };
        let request = json!({"action": "navigate", "url": "https://example.com"});

        let runtime = tokio::runtime::Runtime::new().unwrap();
        let outcome = runtime
            .block_on(ComputerTool.call(request, &ctx))
            .unwrap();

        assert!(outcome.is_error);
    }

    /// The computer schema must offer `press` plus the coordinate, session and
    /// key properties.
    #[test]
    fn computer_schema_exposes_coordinate_session_and_press_controls() {
        let schema = ComputerTool.schema();
        let properties = &schema["properties"];

        let actions = properties["action"]["enum"]
            .as_array()
            .expect("action enum");
        assert!(actions.iter().any(|v| v == "press"));

        for key in ["x", "y", "session_id", "key"] {
            assert!(properties.get(key).is_some());
        }
    }

    /// The embedded driver must locate Playwright dynamically rather than via
    /// a static `from "playwright"` import.
    #[test]
    fn embedded_driver_resolves_playwright_from_workspace() {
        for needle in ["createRequire", "process.cwd()", "SPARROW_PLAYWRIGHT_ROOT"] {
            assert!(PLAYWRIGHT_DRIVER.contains(needle));
        }
        assert!(!PLAYWRIGHT_DRIVER.contains("from \"playwright\""));
    }
}