mermaid_cli/providers/tool/computer_use/mod.rs
1//! Computer-use tools — screenshot capture, mouse + keyboard control.
2//!
3//! Seven tools total (`screenshot`, `click`, `type_text`, `press_key`,
4//! `scroll`, `mouse_move`, `list_windows`) share one `ComputerUseDriver`.
5//! The driver owns the platform-specific subprocess dispatch (scrot/
6//! xdotool on X11, grim/ydotool on Wayland, screencapture/cliclick on
7//! macOS) and the `ScreenshotRegistry` — a small LRU buffer of recent
8//! capture metadata so the model can pass `screenshot_id` on
9//! `click`/`mouse_move` to lock coordinates to a specific capture.
10//!
11//! Registration is gated two ways:
12//! - `TuiMode::Headless` (`mermaid run <prompt>`) never registers any
13//! computer-use tool regardless of what the display probes say —
14//! a CI job has no user to watch a screenshot.
//! - `probe()` (a free function in this module) runs an eager
//!   capability check at startup (env vars + required binaries +
//!   `xdpyinfo` smoke test on X11). If the result is
//!   `Backend::Unsupported`, no tools register.
18//!
19//! The driver ALSO exposes `ensure_alive()` which every tool calls at
20//! the top of `execute`. It's a cheap re-probe that catches the
21//! "`DISPLAY=:0` ghost" case: env looks right, binaries exist, but
22//! the X server is actually unreachable (SSH forwarding without an
23//! X server, detached display, laptop lid closed).
24
25pub mod click;
26pub mod driver;
27pub mod list_windows;
28pub mod mouse_move;
29pub mod press_key;
30pub mod screenshot;
31pub mod scroll;
32pub mod type_text;
33
34use std::path::Path;
35use std::process::Command;
36
37use serde_json::Value;
38
39use crate::domain::{ToolMetadata, ToolOutcome, ToolRunMetadata};
40
41pub use click::ClickTool;
42pub use driver::ComputerUseDriver;
43pub use list_windows::ListWindowsTool;
44pub use mouse_move::MouseMoveTool;
45pub use press_key::PressKeyTool;
46pub use screenshot::ScreenshotTool;
47pub use scroll::ScrollTool;
48pub use type_text::TypeTextTool;
49
/// Platform / display server that the computer-use driver dispatches to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Backend {
    /// Linux with an X server.
    X11,
    /// Linux with a Wayland compositor.
    Wayland,
    /// macOS.
    MacOS,
    /// Windows. `probe()` in this module never returns this variant yet.
    Windows,
    /// No drivable display was detected.
    Unsupported,
}

impl Backend {
    /// Returns `true` for every backend except [`Backend::Unsupported`].
    ///
    /// The match is deliberately exhaustive so that adding a variant
    /// forces a decision here instead of silently defaulting.
    pub fn is_usable(self) -> bool {
        match self {
            Backend::Unsupported => false,
            Backend::X11 | Backend::Wayland | Backend::MacOS | Backend::Windows => true,
        }
    }
}
66
67/// Eager probe. Runs at startup to decide registration — does the
68/// right binary exist? Is the display reachable? Returns
69/// `Backend::Unsupported` when mermaid can't drive the display even
70/// though env vars might suggest otherwise (e.g. SSH forwarding).
71pub fn probe() -> Backend {
72 if cfg!(target_os = "macos") {
73 if has_command("screencapture") {
74 return Backend::MacOS;
75 }
76 return Backend::Unsupported;
77 }
78 if cfg!(target_os = "windows") {
79 // Windows backend is a v0.6 stub — not wired here. Once a
80 // real impl lands, probe PowerShell / SendInput here.
81 return Backend::Unsupported;
82 }
83
84 // Linux: try Wayland first (prefer if both are set).
85 if std::env::var("WAYLAND_DISPLAY").is_ok()
86 && has_command("grim")
87 && (has_command("ydotool") || has_command("wtype"))
88 {
89 return Backend::Wayland;
90 }
91
92 // Linux: fall back to X11. The xdpyinfo probe catches the ghost
93 // case — DISPLAY is set but no X server responds (common over
94 // SSH without X forwarding, or after a stale SSH reconnect).
95 if std::env::var("DISPLAY").is_ok()
96 && has_command("scrot")
97 && has_command("xdotool")
98 && xdpyinfo_alive()
99 {
100 return Backend::X11;
101 }
102
103 Backend::Unsupported
104}
105
106/// Quick re-probe used by `ComputerUseDriver::ensure_alive`. Cheaper
107/// than the full `probe()` — just checks the display answers — so
108/// every tool call can afford it.
109pub fn display_is_reachable(backend: Backend) -> bool {
110 match backend {
111 Backend::X11 => xdpyinfo_alive(),
112 Backend::Wayland => std::env::var("WAYLAND_DISPLAY").is_ok(),
113 Backend::MacOS | Backend::Windows => true,
114 Backend::Unsupported => false,
115 }
116}
117
118pub(super) fn has_command(name: &str) -> bool {
119 // `which` returns 0 iff the binary is on PATH. Cheap and universal
120 // across Linux + macOS; Windows would want `where.exe` but
121 // computer-use on Windows is stubbed out anyway.
122 Command::new("which")
123 .arg(name)
124 .output()
125 .map(|o| o.status.success() && !o.stdout.is_empty())
126 .unwrap_or(false)
127}
128
129/// Exit-0 check on `xdpyinfo` with a 200ms timeout. This is the
130/// difference between "`DISPLAY` is set" and "an X server will
131/// actually answer us."
132fn xdpyinfo_alive() -> bool {
133 if !has_command("xdpyinfo") {
134 // Some minimal X setups don't ship xdpyinfo. Fall back to a
135 // `xdotool getactivewindow` probe (we already require
136 // xdotool for clicks anyway).
137 return Command::new("xdotool")
138 .arg("getactivewindow")
139 .output()
140 .map(|o| o.status.success())
141 .unwrap_or(false);
142 }
143 // Use a timeout wrapper so a wedged display doesn't hang startup.
144 match Command::new("timeout").arg("0.2").arg("xdpyinfo").output() {
145 Ok(o) => o.status.success(),
146 Err(_) => {
147 // `timeout` not available (macOS older versions). Fall
148 // back to a direct call — shouldn't happen on Linux X11.
149 Command::new("xdpyinfo")
150 .output()
151 .map(|o| o.status.success())
152 .unwrap_or(false)
153 },
154 }
155}
156
/// Returns the file stem of `p` (final component without its last
/// extension) as an owned `String`.
///
/// Falls back to the literal `"unknown"` when the path has no file
/// stem (empty path, `/`, a trailing `..`) or when the stem is not
/// valid UTF-8. No further sanitising of the stem is performed.
// Not called anywhere in this file, hence the lint allowance.
#[allow(dead_code)]
pub(crate) fn path_stem(p: &Path) -> String {
    match p.file_stem().and_then(|stem| stem.to_str()) {
        Some(stem) => stem.to_owned(),
        None => String::from("unknown"),
    }
}
165
166pub(super) fn computer_use_success(
167 action: &'static str,
168 params: Value,
169 output: String,
170 duration_secs: f64,
171) -> ToolOutcome {
172 ToolOutcome::success(output, format!("{} completed", action), duration_secs).with_metadata(
173 ToolRunMetadata {
174 detail: ToolMetadata::ComputerUse {
175 action: action.to_string(),
176 params,
177 },
178 ..ToolRunMetadata::default()
179 },
180 )
181}
182
#[cfg(test)]
mod tests {
    use super::*;

    /// Every variant except `Unsupported` must report itself usable,
    /// including the not-yet-probed `Windows` variant.
    #[test]
    fn backend_unsupported_is_not_usable() {
        assert!(!Backend::Unsupported.is_usable());
        assert!(Backend::X11.is_usable());
        assert!(Backend::Wayland.is_usable());
        assert!(Backend::MacOS.is_usable());
        assert!(Backend::Windows.is_usable());
    }

    /// Smoke test: `probe()` must complete without panicking whatever
    /// the environment looks like.
    #[test]
    fn probe_does_not_panic_on_headless() {
        // Most CI runners have neither DISPLAY nor WAYLAND_DISPLAY and
        // would yield `Unsupported`, but dev machines may have a live
        // display, so the result is intentionally not asserted.
        let _ = probe();
    }

    /// The arms of `display_is_reachable` that do not touch the
    /// environment or spawn processes have fixed answers.
    #[test]
    fn display_reachability_for_static_backends() {
        assert!(!display_is_reachable(Backend::Unsupported));
        assert!(display_is_reachable(Backend::MacOS));
        assert!(display_is_reachable(Backend::Windows));
    }
}
203}