Skip to main content

mermaid_cli/providers/tool/computer_use/
screenshot.rs

1//! `screenshot` tool — capture the display (or a slice of it) and
2//! return the image for the model to reason about.
3//!
4//! Accepted modes: `fullscreen` (default), `focused`, `monitor`,
5//! `region`, `window`. Coordinate metadata is registered in the
6//! driver's `ScreenshotRegistry` so later `click` / `mouse_move`
7//! calls can quote `screenshot_id` to lock their coordinates to this
8//! specific capture.
9
10use std::sync::Arc;
11use std::time::Instant;
12
13use async_trait::async_trait;
14use serde_json::Value;
15
16use crate::domain::{ToolDefinition, ToolOutcome};
17use crate::providers::ctx::{ExecContext, ProgressEvent};
18
19use super::super::ToolExecutor;
20use super::computer_use_success;
21use super::driver::{ComputerUseDriver, ScreenshotSpec};
22
23pub struct ScreenshotTool {
24    driver: Arc<ComputerUseDriver>,
25}
26
27impl ScreenshotTool {
28    pub fn new(driver: Arc<ComputerUseDriver>) -> Self {
29        Self { driver }
30    }
31}
32
33#[async_trait]
34impl ToolExecutor for ScreenshotTool {
35    fn name(&self) -> &'static str {
36        "screenshot"
37    }
38
39    fn schema(&self) -> ToolDefinition {
40        ToolDefinition {
41            name: "screenshot".to_string(),
42            description: "Capture the display and return it as a base64 PNG plus a screenshot id. \
43                 Modes: 'fullscreen' (default), 'focused' (active window, X11/macOS only), \
44                 'monitor' (pass `monitor` with an output name from xrandr), 'region' \
45                 (pass `region` as 'X,Y,WIDTHxHEIGHT'), 'window' (pass `window` with a \
46                 title substring, X11 only). The returned id can be passed as `screenshot_id` \
47                 on later `click` / `mouse_move` calls so coordinates are translated using \
48                 the right scale + offset."
49                .to_string(),
50            input_schema: serde_json::json!({
51                "type": "object",
52                "properties": {
53                    "mode": { "type": "string", "enum": ["fullscreen", "focused", "monitor", "region", "window"], "default": "fullscreen" },
54                    "monitor": { "type": "string", "description": "Output name from xrandr --query (required for mode='monitor')" },
55                    "region": { "type": "string", "description": "'X,Y,WIDTHxHEIGHT' (required for mode='region')" },
56                    "window": { "type": "string", "description": "Window title substring (required for mode='window', X11 only)" }
57                }
58            }),
59        }
60    }
61
62    async fn execute(&self, args: Value, ctx: ExecContext) -> ToolOutcome {
63        let started = Instant::now();
64
65        if let Err(error) = self.driver.ensure_alive() {
66            return ToolOutcome::error(error, started.elapsed().as_secs_f64());
67        }
68
69        let spec = match parse_spec(&args) {
70            Ok(s) => s,
71            Err(e) => return ToolOutcome::error(e, started.elapsed().as_secs_f64()),
72        };
73
74        let result = tokio::select! {
75            biased;
76            _ = ctx.token.cancelled() => return ToolOutcome::cancelled(),
77            r = self.driver.capture(spec, &ctx.token) => r,
78        };
79
80        let cap = match result {
81            Ok(c) => c,
82            Err(e) => {
83                return ToolOutcome::error(
84                    format!("screenshot capture failed: {}", e),
85                    started.elapsed().as_secs_f64(),
86                );
87            },
88        };
89
90        // Emit an inline preview via the progress channel. The reducer
91        // routes `image/*` artifacts onto the active assistant
92        // message's `images` for immediate chat display; the final
93        // outcome carries the same base64 so conversation save/load
94        // keeps the screenshot in history. Dedupe between the two
95        // paths is owned by the chat widget (Commit 6).
96        let _ = ctx
97            .progress
98            .send(ProgressEvent::Artifact {
99                mime: "image/png".to_string(),
100                data: cap.raw_bytes,
101                caption: Some(format!("screenshot #{}", cap.id)),
102            })
103            .await;
104
105        computer_use_success(
106            "screenshot",
107            args,
108            cap.summary,
109            started.elapsed().as_secs_f64(),
110        )
111        .with_images(vec![cap.base64_png])
112    }
113}
114
115fn parse_spec(args: &Value) -> Result<ScreenshotSpec, String> {
116    let mode = args
117        .get("mode")
118        .and_then(|v| v.as_str())
119        .unwrap_or("fullscreen")
120        .to_lowercase();
121    match mode.as_str() {
122        "fullscreen" | "" => Ok(ScreenshotSpec::Fullscreen),
123        "focused" => Ok(ScreenshotSpec::Focused),
124        "monitor" => {
125            let name = args
126                .get("monitor")
127                .and_then(|v| v.as_str())
128                .ok_or_else(|| {
129                    "Monitor name required for mode='monitor'. Use `xrandr --query` to list."
130                        .to_string()
131                })?
132                .to_string();
133            Ok(ScreenshotSpec::Monitor(name))
134        },
135        "region" => {
136            let region = args.get("region").and_then(|v| v.as_str()).ok_or_else(|| {
137                "Region required for mode='region'. Format: 'X,Y,WIDTHxHEIGHT'".to_string()
138            })?;
139            parse_region(region)
140        },
141        "window" => {
142            let title = args
143                .get("window")
144                .and_then(|v| v.as_str())
145                .ok_or_else(|| {
146                    "Window title required for mode='window'. Use list_windows first.".to_string()
147                })?
148                .to_string();
149            Ok(ScreenshotSpec::Window(title))
150        },
151        other => Err(format!(
152            "Unknown screenshot mode '{}'. Valid: fullscreen|focused|monitor|region|window",
153            other
154        )),
155    }
156}
157
158fn parse_region(s: &str) -> Result<ScreenshotSpec, String> {
159    // 'X,Y,WIDTHxHEIGHT' e.g. '100,50,800x600'
160    let parts: Vec<&str> = s.splitn(3, ',').collect();
161    if parts.len() != 3 {
162        return Err(format!(
163            "Invalid region '{}'. Expected 'X,Y,WIDTHxHEIGHT' (e.g. '100,50,800x600').",
164            s
165        ));
166    }
167    let x: i32 = parts[0]
168        .parse()
169        .map_err(|_| format!("Invalid X in region '{}'", s))?;
170    let y: i32 = parts[1]
171        .parse()
172        .map_err(|_| format!("Invalid Y in region '{}'", s))?;
173    let (w, h) = parts[2]
174        .split_once('x')
175        .ok_or_else(|| format!("Invalid WIDTHxHEIGHT in region '{}'", s))?;
176    let width: u32 = w
177        .parse()
178        .map_err(|_| format!("Invalid width in region '{}'", s))?;
179    let height: u32 = h
180        .parse()
181        .map_err(|_| format!("Invalid height in region '{}'", s))?;
182    Ok(ScreenshotSpec::Region(x, y, width, height))
183}
184
#[cfg(test)]
mod tests {
    //! Unit tests for argument parsing; none of them touch the driver.

    use super::*;
    use serde_json::json;

    /// With no arguments at all the tool captures the whole screen.
    #[test]
    fn parse_spec_defaults_to_fullscreen() {
        let args = json!({});
        assert!(matches!(parse_spec(&args), Ok(ScreenshotSpec::Fullscreen)));
    }

    /// A well-formed region string yields the four numbers unchanged.
    #[test]
    fn parse_spec_region_parses_valid() {
        let args = json!({ "mode": "region", "region": "10,20,800x600" });
        match parse_spec(&args) {
            Ok(ScreenshotSpec::Region(x, y, w, h)) => {
                assert_eq!((x, y, w, h), (10, 20, 800, 600));
            },
            _ => panic!("expected Region"),
        }
    }

    /// A region without the comma-separated shape reports the expected format.
    #[test]
    fn parse_spec_region_rejects_malformed() {
        let args = json!({ "mode": "region", "region": "oops" });
        match parse_spec(&args) {
            Err(message) => assert!(message.contains("Expected")),
            Ok(_) => panic!("expected an error for a malformed region"),
        }
    }

    /// `monitor` mode without a `monitor` argument is an error.
    #[test]
    fn parse_spec_monitor_requires_name() {
        let args = json!({ "mode": "monitor" });
        match parse_spec(&args) {
            Err(message) => assert!(message.contains("Monitor name")),
            Ok(_) => panic!("expected an error for a missing monitor name"),
        }
    }

    /// Mode names are matched without regard to letter case.
    #[test]
    fn parse_spec_mode_is_case_insensitive() {
        let args = json!({ "mode": "FullScreen" });
        assert!(matches!(parse_spec(&args), Ok(ScreenshotSpec::Fullscreen)));
    }
}