Skip to main content

mermaid_cli/providers/tool/computer_use/
mod.rs

1//! Computer-use tools — screenshot capture, mouse + keyboard control.
2//!
3//! Seven tools total (`screenshot`, `click`, `type_text`, `press_key`,
4//! `scroll`, `mouse_move`, `list_windows`) share one `ComputerUseDriver`.
5//! The driver owns the platform-specific subprocess dispatch (scrot/
6//! xdotool on X11, grim/ydotool on Wayland, screencapture/cliclick on
7//! macOS) and the `ScreenshotRegistry` — a small LRU buffer of recent
8//! capture metadata so the model can pass `screenshot_id` on
9//! `click`/`mouse_move` to lock coordinates to a specific capture.
10//!
11//! Registration is gated two ways:
12//! - `TuiMode::Headless` (`mermaid run <prompt>`) never registers any
13//!   computer-use tool regardless of what the display probes say —
14//!   a CI job has no user to watch a screenshot.
//! - `probe()` runs an eager capability check at startup
//!   (env vars + required binaries + `xdpyinfo` smoke test). If the
//!   result is `Backend::Unsupported`, no tools register.
18//!
19//! The driver ALSO exposes `ensure_alive()` which every tool calls at
20//! the top of `execute`. It's a cheap re-probe that catches the
21//! "`DISPLAY=:0` ghost" case: env looks right, binaries exist, but
22//! the X server is actually unreachable (SSH forwarding without an
23//! X server, detached display, laptop lid closed).
24
25pub mod click;
26pub mod driver;
27pub mod list_windows;
28pub mod mouse_move;
29pub mod press_key;
30pub mod screenshot;
31pub mod scroll;
32pub mod type_text;
33
34use std::path::Path;
35use std::process::Command;
36
37use serde_json::Value;
38
39use crate::domain::{ToolMetadata, ToolOutcome, ToolRunMetadata};
40
41pub use click::ClickTool;
42pub use driver::ComputerUseDriver;
43pub use list_windows::ListWindowsTool;
44pub use mouse_move::MouseMoveTool;
45pub use press_key::PressKeyTool;
46pub use screenshot::ScreenshotTool;
47pub use scroll::ScrollTool;
48pub use type_text::TypeTextTool;
49
/// Platform / display-server the driver dispatches to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Backend {
    /// Linux X11 session; `probe()` requires `scrot` and `xdotool`.
    X11,
    /// Linux Wayland session; `probe()` requires `grim` plus either
    /// `ydotool` or `wtype`.
    Wayland,
    /// macOS; `probe()` requires `screencapture`.
    MacOS,
    /// Windows. `probe()` never returns this today — the backend is a
    /// stub — but the variant still counts as usable.
    Windows,
    /// No display can be driven; no computer-use tool should register.
    Unsupported,
}

impl Backend {
    /// `true` for every backend except [`Backend::Unsupported`].
    pub fn is_usable(self) -> bool {
        self != Backend::Unsupported
    }
}
66
67/// Eager probe. Runs at startup to decide registration — does the
68/// right binary exist? Is the display reachable? Returns
69/// `Backend::Unsupported` when mermaid can't drive the display even
70/// though env vars might suggest otherwise (e.g. SSH forwarding).
71pub fn probe() -> Backend {
72    if cfg!(target_os = "macos") {
73        if has_command("screencapture") {
74            return Backend::MacOS;
75        }
76        return Backend::Unsupported;
77    }
78    if cfg!(target_os = "windows") {
79        // Windows backend is a v0.6 stub — not wired here. Once a
80        // real impl lands, probe PowerShell / SendInput here.
81        return Backend::Unsupported;
82    }
83
84    // Linux: try Wayland first (prefer if both are set).
85    if std::env::var("WAYLAND_DISPLAY").is_ok()
86        && has_command("grim")
87        && (has_command("ydotool") || has_command("wtype"))
88    {
89        return Backend::Wayland;
90    }
91
92    // Linux: fall back to X11. The xdpyinfo probe catches the ghost
93    // case — DISPLAY is set but no X server responds (common over
94    // SSH without X forwarding, or after a stale SSH reconnect).
95    if std::env::var("DISPLAY").is_ok()
96        && has_command("scrot")
97        && has_command("xdotool")
98        && xdpyinfo_alive()
99    {
100        return Backend::X11;
101    }
102
103    Backend::Unsupported
104}
105
106/// Quick re-probe used by `ComputerUseDriver::ensure_alive`. Cheaper
107/// than the full `probe()` — just checks the display answers — so
108/// every tool call can afford it.
109pub fn display_is_reachable(backend: Backend) -> bool {
110    match backend {
111        Backend::X11 => xdpyinfo_alive(),
112        Backend::Wayland => std::env::var("WAYLAND_DISPLAY").is_ok(),
113        Backend::MacOS | Backend::Windows => true,
114        Backend::Unsupported => false,
115    }
116}
117
118pub(super) fn has_command(name: &str) -> bool {
119    // `which` returns 0 iff the binary is on PATH. Cheap and universal
120    // across Linux + macOS; Windows would want `where.exe` but
121    // computer-use on Windows is stubbed out anyway.
122    Command::new("which")
123        .arg(name)
124        .output()
125        .map(|o| o.status.success() && !o.stdout.is_empty())
126        .unwrap_or(false)
127}
128
/// Exit-0 check on `xdpyinfo` with a 200ms deadline. This is the
/// difference between "`DISPLAY` is set" and "an X server will
/// actually answer us."
///
/// If `xdpyinfo` cannot be started at all (some minimal X setups
/// don't ship it), falls back to `xdotool getactivewindow` — xdotool
/// is already required for clicks — under the same deadline. Returns
/// `false` when neither probe can be started, when the probe exits
/// non-zero, or when it is still running at the deadline.
fn xdpyinfo_alive() -> bool {
    const PROBE_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(200);

    if let Some(alive) = exits_ok_within(&mut Command::new("xdpyinfo"), PROBE_TIMEOUT) {
        return alive;
    }
    exits_ok_within(Command::new("xdotool").arg("getactivewindow"), PROBE_TIMEOUT)
        .unwrap_or(false)
}

/// Runs `cmd` with all stdio discarded and reports whether it exited
/// successfully before `limit` elapsed.
///
/// Returns `None` if the process could not be spawned, `Some(true)` on
/// a zero exit within the limit, and `Some(false)` otherwise. A child
/// that overruns the limit is killed and reaped, so a wedged display
/// can neither hang the caller nor leave a zombie behind. The deadline
/// is enforced in-process, so no external `timeout` binary is needed.
fn exits_ok_within(cmd: &mut Command, limit: std::time::Duration) -> Option<bool> {
    use std::process::Stdio;
    use std::time::{Duration, Instant};

    // Null stdio: the output is never read, and an unread pipe could
    // block a chatty child (xdpyinfo prints a lot) until the deadline.
    let mut child = cmd
        .stdin(Stdio::null())
        .stdout(Stdio::null())
        .stderr(Stdio::null())
        .spawn()
        .ok()?;

    let deadline = Instant::now() + limit;
    loop {
        match child.try_wait() {
            Ok(Some(status)) => return Some(status.success()),
            Ok(None) if Instant::now() < deadline => {
                std::thread::sleep(Duration::from_millis(5));
            },
            // Deadline passed, or waiting itself failed: give up.
            _ => {
                let _ = child.kill();
                let _ = child.wait();
                return Some(false);
            },
        }
    }
}
156
/// Utility: the file stem of `p` (final component without its last
/// extension) as an owned `String`, for naming temp files.
///
/// Yields `"unknown"` when the path has no file stem or the stem is
/// not valid UTF-8.
#[allow(dead_code)]
pub(crate) fn path_stem(p: &Path) -> String {
    match p.file_stem().and_then(|stem| stem.to_str()) {
        Some(stem) => stem.to_owned(),
        None => String::from("unknown"),
    }
}
165
166pub(super) fn computer_use_success(
167    action: &'static str,
168    params: Value,
169    output: String,
170    duration_secs: f64,
171) -> ToolOutcome {
172    ToolOutcome::success(output, format!("{} completed", action), duration_secs).with_metadata(
173        ToolRunMetadata {
174            detail: ToolMetadata::ComputerUse {
175                action: action.to_string(),
176                params,
177            },
178            ..ToolRunMetadata::default()
179        },
180    )
181}
182
#[cfg(test)]
mod tests {
    use super::*;

    /// `Unsupported` is the one backend that must not register tools.
    #[test]
    fn backend_unsupported_is_not_usable() {
        assert!(!Backend::Unsupported.is_usable());
        for backend in &[Backend::X11, Backend::Wayland, Backend::MacOS] {
            assert!(backend.is_usable());
        }
    }

    /// Smoke test: `probe()` must complete without panicking wherever
    /// the suite runs. Headless CI (no DISPLAY, no WAYLAND_DISPLAY)
    /// will typically see `Unsupported`, but a dev machine may have a
    /// live display, so the result itself is deliberately not
    /// asserted.
    #[test]
    fn probe_does_not_panic_on_headless() {
        let _backend = probe();
    }
}