mermaid_cli/providers/tool/computer_use/
screenshot.rs1use std::sync::Arc;
11use std::time::Instant;
12
13use async_trait::async_trait;
14use serde_json::Value;
15
16use crate::domain::{ToolDefinition, ToolOutcome};
17use crate::providers::ctx::{ExecContext, ProgressEvent};
18
19use super::super::ToolExecutor;
20use super::computer_use_success;
21use super::driver::{ComputerUseDriver, ScreenshotSpec};
22
23pub struct ScreenshotTool {
24 driver: Arc<ComputerUseDriver>,
25}
26
27impl ScreenshotTool {
28 pub fn new(driver: Arc<ComputerUseDriver>) -> Self {
29 Self { driver }
30 }
31}
32
33#[async_trait]
34impl ToolExecutor for ScreenshotTool {
35 fn name(&self) -> &'static str {
36 "screenshot"
37 }
38
39 fn schema(&self) -> ToolDefinition {
40 ToolDefinition {
41 name: "screenshot".to_string(),
42 description: "Capture the display and return it as a base64 PNG plus a screenshot id. \
43 Modes: 'fullscreen' (default), 'focused' (active window, X11/macOS only), \
44 'monitor' (pass `monitor` with an output name from xrandr), 'region' \
45 (pass `region` as 'X,Y,WIDTHxHEIGHT'), 'window' (pass `window` with a \
46 title substring, X11 only). The returned id can be passed as `screenshot_id` \
47 on later `click` / `mouse_move` calls so coordinates are translated using \
48 the right scale + offset."
49 .to_string(),
50 input_schema: serde_json::json!({
51 "type": "object",
52 "properties": {
53 "mode": { "type": "string", "enum": ["fullscreen", "focused", "monitor", "region", "window"], "default": "fullscreen" },
54 "monitor": { "type": "string", "description": "Output name from xrandr --query (required for mode='monitor')" },
55 "region": { "type": "string", "description": "'X,Y,WIDTHxHEIGHT' (required for mode='region')" },
56 "window": { "type": "string", "description": "Window title substring (required for mode='window', X11 only)" }
57 }
58 }),
59 }
60 }
61
62 async fn execute(&self, args: Value, ctx: ExecContext) -> ToolOutcome {
63 let started = Instant::now();
64
65 if let Err(error) = self.driver.ensure_alive() {
66 return ToolOutcome::error(error, started.elapsed().as_secs_f64());
67 }
68
69 let spec = match parse_spec(&args) {
70 Ok(s) => s,
71 Err(e) => return ToolOutcome::error(e, started.elapsed().as_secs_f64()),
72 };
73
74 let result = tokio::select! {
75 biased;
76 _ = ctx.token.cancelled() => return ToolOutcome::cancelled(),
77 r = self.driver.capture(spec, &ctx.token) => r,
78 };
79
80 let cap = match result {
81 Ok(c) => c,
82 Err(e) => {
83 return ToolOutcome::error(
84 format!("screenshot capture failed: {}", e),
85 started.elapsed().as_secs_f64(),
86 );
87 },
88 };
89
90 let _ = ctx
97 .progress
98 .send(ProgressEvent::Artifact {
99 mime: "image/png".to_string(),
100 data: cap.raw_bytes,
101 caption: Some(format!("screenshot #{}", cap.id)),
102 })
103 .await;
104
105 computer_use_success(
106 "screenshot",
107 args,
108 cap.summary,
109 started.elapsed().as_secs_f64(),
110 )
111 .with_images(vec![cap.base64_png])
112 }
113}
114
115fn parse_spec(args: &Value) -> Result<ScreenshotSpec, String> {
116 let mode = args
117 .get("mode")
118 .and_then(|v| v.as_str())
119 .unwrap_or("fullscreen")
120 .to_lowercase();
121 match mode.as_str() {
122 "fullscreen" | "" => Ok(ScreenshotSpec::Fullscreen),
123 "focused" => Ok(ScreenshotSpec::Focused),
124 "monitor" => {
125 let name = args
126 .get("monitor")
127 .and_then(|v| v.as_str())
128 .ok_or_else(|| {
129 "Monitor name required for mode='monitor'. Use `xrandr --query` to list."
130 .to_string()
131 })?
132 .to_string();
133 Ok(ScreenshotSpec::Monitor(name))
134 },
135 "region" => {
136 let region = args.get("region").and_then(|v| v.as_str()).ok_or_else(|| {
137 "Region required for mode='region'. Format: 'X,Y,WIDTHxHEIGHT'".to_string()
138 })?;
139 parse_region(region)
140 },
141 "window" => {
142 let title = args
143 .get("window")
144 .and_then(|v| v.as_str())
145 .ok_or_else(|| {
146 "Window title required for mode='window'. Use list_windows first.".to_string()
147 })?
148 .to_string();
149 Ok(ScreenshotSpec::Window(title))
150 },
151 other => Err(format!(
152 "Unknown screenshot mode '{}'. Valid: fullscreen|focused|monitor|region|window",
153 other
154 )),
155 }
156}
157
158fn parse_region(s: &str) -> Result<ScreenshotSpec, String> {
159 let parts: Vec<&str> = s.splitn(3, ',').collect();
161 if parts.len() != 3 {
162 return Err(format!(
163 "Invalid region '{}'. Expected 'X,Y,WIDTHxHEIGHT' (e.g. '100,50,800x600').",
164 s
165 ));
166 }
167 let x: i32 = parts[0]
168 .parse()
169 .map_err(|_| format!("Invalid X in region '{}'", s))?;
170 let y: i32 = parts[1]
171 .parse()
172 .map_err(|_| format!("Invalid Y in region '{}'", s))?;
173 let (w, h) = parts[2]
174 .split_once('x')
175 .ok_or_else(|| format!("Invalid WIDTHxHEIGHT in region '{}'", s))?;
176 let width: u32 = w
177 .parse()
178 .map_err(|_| format!("Invalid width in region '{}'", s))?;
179 let height: u32 = h
180 .parse()
181 .map_err(|_| format!("Invalid height in region '{}'", s))?;
182 Ok(ScreenshotSpec::Region(x, y, width, height))
183}
184
#[cfg(test)]
mod tests {
    use super::*;

    /// With no arguments at all the spec falls back to a fullscreen capture.
    #[test]
    fn parse_spec_defaults_to_fullscreen() {
        let args = serde_json::json!({});
        assert!(matches!(parse_spec(&args), Ok(ScreenshotSpec::Fullscreen)));
    }

    /// A well-formed region string yields its four numeric components.
    #[test]
    fn parse_spec_region_parses_valid() {
        let args = serde_json::json!({"mode": "region", "region": "10,20,800x600"});
        let parsed = parse_spec(&args);
        assert!(
            matches!(parsed, Ok(ScreenshotSpec::Region(10, 20, 800, 600))),
            "expected Region"
        );
    }

    /// A region string without the expected shape is rejected with a format hint.
    #[test]
    fn parse_spec_region_rejects_malformed() {
        let args = serde_json::json!({"mode": "region", "region": "oops"});
        let message = parse_spec(&args).expect_err("malformed region must be rejected");
        assert!(message.contains("Expected"));
    }

    /// Monitor mode without a `monitor` argument is an error.
    #[test]
    fn parse_spec_monitor_requires_name() {
        let args = serde_json::json!({"mode": "monitor"});
        let message = parse_spec(&args).expect_err("monitor mode needs a name");
        assert!(message.contains("Monitor name"));
    }

    /// Mode matching ignores letter case.
    #[test]
    fn parse_spec_mode_is_case_insensitive() {
        let args = serde_json::json!({"mode": "FullScreen"});
        assert!(matches!(parse_spec(&args), Ok(ScreenshotSpec::Fullscreen)));
    }
}