Skip to main content

mermaid_cli/providers/tool/computer_use/
driver.rs

//! `ComputerUseDriver` — the shared backend-dispatch layer for the
//! seven computer-use tools.
//!
//! The driver wraps three things:
//!
//!   1. A `Backend` discriminant (`X11`, `Wayland`, `MacOS`, …). Tools
//!      match on it to pick the right subprocess dispatch.
//!   2. A bounded `ScreenshotRegistry`. Every capture gets a stable
//!      `id`; the model includes that id on later `click(x, y,
//!      screenshot_id)` so coordinate translation uses the right
//!      scale+offset even if the newest screenshot has shifted the
//!      "latest" entry.
//!   3. `ensure_alive()` — a cheap re-probe called at the top of every
//!      tool's `execute`. Catches the case where the display went
//!      away between registration and invocation (detached SSH,
//!      closed lid).
//!
//! Subprocess dispatch uses `tokio::process::Command` so each external
//! binary can race against `ctx.token.cancelled()`. `kill_on_drop(true)`
//! reaps children whose parent future gets cancelled.
21
22use std::collections::VecDeque;
23use std::path::PathBuf;
24use std::sync::Mutex;
25use std::sync::atomic::{AtomicU64, Ordering};
26
27use anyhow::{Context, Result};
28use base64::{Engine as _, engine::general_purpose};
29use tokio::process::Command;
30use tokio_util::sync::CancellationToken;
31
32use crate::constants::{SCREENSHOT_MAX_WIDTH, SCREENSHOT_REGISTRY_CAPACITY};
33
34use super::Backend;
35
/// Coordinate-translation data recorded for one capture.
///
/// `ComputerUseDriver::scale_coords` multiplies a model-space
/// coordinate by `scale_factor` and then adds the matching offset to
/// obtain the screen-space coordinate.
#[derive(Debug, Clone)]
pub struct ScreenshotMetadata {
    /// Registry id assigned to this capture.
    pub id: u64,
    /// Multiplier applied to model-space coordinates.
    pub scale_factor: f64,
    /// Horizontal offset added after scaling.
    pub offset_x: i32,
    /// Vertical offset added after scaling.
    pub offset_y: i32,
    /// Human-readable capture kind (`"fullscreen"`, `"focused window"`,
    /// …).
    pub kind: String,
}
49
50/// Bounded ring buffer of recent screenshot metadata. Capacity from
51/// `constants::SCREENSHOT_REGISTRY_CAPACITY` (= 16). When full, push
52/// evicts the oldest; referencing an evicted id fails cleanly with
53/// "take a fresh screenshot" rather than silently clicking at wrong
54/// coordinates.
55#[derive(Debug, Default)]
56pub struct ScreenshotRegistry {
57    entries: VecDeque<ScreenshotMetadata>,
58}
59
60impl ScreenshotRegistry {
61    pub fn new() -> Self {
62        Self {
63            entries: VecDeque::new(),
64        }
65    }
66
67    pub fn push(&mut self, meta: ScreenshotMetadata) {
68        if self.entries.len() >= SCREENSHOT_REGISTRY_CAPACITY {
69            self.entries.pop_front();
70        }
71        self.entries.push_back(meta);
72    }
73
74    pub fn get(&self, id: u64) -> Option<&ScreenshotMetadata> {
75        self.entries.iter().find(|m| m.id == id)
76    }
77
78    pub fn latest(&self) -> Option<&ScreenshotMetadata> {
79        self.entries.back()
80    }
81
82    pub fn len(&self) -> usize {
83        self.entries.len()
84    }
85
86    pub fn is_empty(&self) -> bool {
87        self.entries.is_empty()
88    }
89}
90
/// Selects which part of the display a capture should cover.
#[derive(Debug, Clone)]
pub enum ScreenshotSpec {
    /// The whole display.
    Fullscreen,
    /// The currently focused window.
    Focused,
    /// A single monitor / output, identified by name.
    Monitor(String),
    /// A rectangle given as `(x, y, width, height)` in screen pixels.
    Region(i32, i32, u32, u32),
    /// A window looked up by title.
    Window(String),
}
102
/// Everything a capture hands back to the calling tool: the image
/// bytes (raw and base64-encoded), the registry id, the translation
/// metadata and a one-line description for the tool's `output` field.
#[derive(Debug)]
pub struct CaptureResult {
    /// Registry id allocated for this capture.
    pub id: u64,
    /// `raw_bytes` encoded as standard base64.
    pub base64_png: String,
    /// Image file contents as read from disk.
    pub raw_bytes: Vec<u8>,
    /// Image width taken from the PNG header (0 when it could not be read).
    pub width: u32,
    /// Image height taken from the PNG header (0 when it could not be read).
    pub height: u32,
    /// Model-space → screen-space multiplier recorded in the registry.
    pub scale_factor: f64,
    /// Horizontal screen offset of the captured area.
    pub offset_x: i32,
    /// Vertical screen offset of the captured area.
    pub offset_y: i32,
    /// Human-readable description of the capture.
    pub summary: String,
}
117
118/// Shared driver all seven computer-use tools hold an `Arc<>` to.
119pub struct ComputerUseDriver {
120    backend: Backend,
121    registry: Mutex<ScreenshotRegistry>,
122    /// Monotonic counter for temp file uniqueness. Distinct from the
123    /// registry id counter so filenames don't collide across runs
124    /// that share temp dir.
125    file_counter: AtomicU64,
126    /// Monotonic counter for registry ids — stable across process
127    /// lifetime, survives evictions.
128    id_counter: AtomicU64,
129}
130
131impl ComputerUseDriver {
132    pub fn new(backend: Backend) -> Self {
133        Self {
134            backend,
135            registry: Mutex::new(ScreenshotRegistry::new()),
136            file_counter: AtomicU64::new(0),
137            id_counter: AtomicU64::new(0),
138        }
139    }
140
141    pub fn backend(&self) -> Backend {
142        self.backend
143    }
144
145    /// Cheap mid-call liveness check. Tools call this first inside
146    /// `execute()`; if the display went away after registration,
147    /// they return a clean error instead of hanging on subprocess
148    /// dispatch.
149    pub fn ensure_alive(&self) -> Result<(), String> {
150        if super::display_is_reachable(self.backend) {
151            Ok(())
152        } else {
153            Err(format!(
154                "Display unreachable (backend={:?}). Was the session \
155                 detached, or did `DISPLAY` change?",
156                self.backend
157            ))
158        }
159    }
160
161    /// Translate model-space coords to screen-space using the metadata
162    /// registered for `screenshot_id` (or the latest if None).
163    pub fn scale_coords(
164        &self,
165        x: i32,
166        y: i32,
167        screenshot_id: Option<u64>,
168    ) -> Result<(i32, i32), String> {
169        let reg = self.registry.lock().map_err(|e| e.to_string())?;
170        let meta = match screenshot_id {
171            Some(id) => reg.get(id).cloned().ok_or_else(|| {
172                format!(
173                    "Screenshot id {} not found in registry (likely evicted — capacity {}). \
174                     Take a fresh screenshot and retry with the new id.",
175                    id, SCREENSHOT_REGISTRY_CAPACITY
176                )
177            })?,
178            None => reg.latest().cloned().ok_or_else(|| {
179                "No screenshots registered yet — call `screenshot` before \
180                 `click` / `mouse_move`."
181                    .to_string()
182            })?,
183        };
184        Ok((
185            (x as f64 * meta.scale_factor) as i32 + meta.offset_x,
186            (y as f64 * meta.scale_factor) as i32 + meta.offset_y,
187        ))
188    }
189
190    /// Allocate a fresh registry id and record metadata.
191    pub fn register_screenshot(
192        &self,
193        scale_factor: f64,
194        offset_x: i32,
195        offset_y: i32,
196        kind: String,
197    ) -> u64 {
198        let id = self.id_counter.fetch_add(1, Ordering::Relaxed);
199        if let Ok(mut reg) = self.registry.lock() {
200            reg.push(ScreenshotMetadata {
201                id,
202                scale_factor,
203                offset_x,
204                offset_y,
205                kind,
206            });
207        }
208        id
209    }
210
211    /// Capture and return the encoded result. Respects cancellation
212    /// via `token.cancelled()` races in the subprocess wait.
213    pub async fn capture(
214        &self,
215        spec: ScreenshotSpec,
216        token: &CancellationToken,
217    ) -> Result<CaptureResult> {
218        self.ensure_alive()
219            .map_err(|error| anyhow::anyhow!(error))?;
220
221        let seq = self.file_counter.fetch_add(1, Ordering::Relaxed);
222        let temp_path = std::env::temp_dir().join(format!("mermaid-screenshot-{}.png", seq));
223        let temp_str = temp_path.to_string_lossy().to_string();
224        let _guard = TempFileGuard(temp_path.clone());
225
226        let (offset_x, offset_y, kind) =
227            dispatch_capture(self.backend, &spec, &temp_str, token).await?;
228
229        let scale_factor = downscale_if_needed(&temp_str, SCREENSHOT_MAX_WIDTH).await?;
230        let id = self.register_screenshot(scale_factor, offset_x, offset_y, kind.clone());
231
232        let raw_bytes = tokio::fs::read(&temp_path)
233            .await
234            .context("reading captured screenshot")?;
235        let width = read_png_width(&raw_bytes).unwrap_or(0);
236        let height = read_png_height(&raw_bytes).unwrap_or(0);
237        let base64_png = general_purpose::STANDARD.encode(&raw_bytes);
238
239        let offset_info = if offset_x != 0 || offset_y != 0 {
240            format!(", offset: +{}+{}", offset_x, offset_y)
241        } else {
242            String::new()
243        };
244        let summary = format!(
245            "Screenshot captured (id: {}, {}, {}x{}, scale: {:.2}x{})",
246            id, kind, width, height, scale_factor, offset_info
247        );
248
249        Ok(CaptureResult {
250            id,
251            base64_png,
252            raw_bytes,
253            width,
254            height,
255            scale_factor,
256            offset_x,
257            offset_y,
258            summary,
259        })
260    }
261
262    /// Convenience for click/type/key tools: capture the focused
263    /// window and return `(summary, base64_png)` for inclusion in
264    /// the tool's auto-screenshot. Best-effort — on error returns
265    /// `None` and the caller can fall back to a screenshot-less
266    /// outcome.
267    pub async fn capture_focused_for_autoshot(
268        &self,
269        token: &CancellationToken,
270    ) -> Option<(String, String)> {
271        let cap = self.capture(ScreenshotSpec::Focused, token).await.ok()?;
272        Some((cap.summary, cap.base64_png))
273    }
274
275    /// X11-only: verify the cursor actually landed where xdotool was
276    /// told to move it. Returns `Some(warning)` if the cursor ended
277    /// up more than `CURSOR_LANDED_TOLERANCE_PX` away (focus change,
278    /// window moved, WM rejected the move). `None` if within
279    /// tolerance or the probe itself failed (best-effort — never
280    /// blocks the click).
281    pub async fn check_cursor_landed(&self, sx: i32, sy: i32) -> Option<String> {
282        if !matches!(self.backend, Backend::X11) {
283            return None;
284        }
285        let out = run_cmd_stdout(Command::new("xdotool").arg("getmouselocation"))
286            .await
287            .ok()?;
288        let mut actual_x: Option<i32> = None;
289        let mut actual_y: Option<i32> = None;
290        for tok in out.split_whitespace() {
291            if let Some(v) = tok.strip_prefix("X:") {
292                actual_x = v.parse().ok();
293            } else if let Some(v) = tok.strip_prefix("Y:") {
294                actual_y = v.parse().ok();
295            }
296        }
297        let (ax, ay) = (actual_x?, actual_y?);
298        if (ax - sx).abs() > CURSOR_LANDED_TOLERANCE_PX
299            || (ay - sy).abs() > CURSOR_LANDED_TOLERANCE_PX
300        {
301            Some(format!(
302                "WARNING: cursor at ({}, {}), expected ({}, {}). Window may have moved \
303                 or focus changed before the click landed.",
304                ax, ay, sx, sy
305            ))
306        } else {
307            None
308        }
309    }
310}
311
/// Maximum per-axis distance, in pixels, between the requested and the
/// observed cursor position before a warning is raised. Fractional
/// HiDPI scaling can leave the cursor a pixel or two off target; a gap
/// above 5px points at something other than rounding.
const CURSOR_LANDED_TOLERANCE_PX: i32 = 5;
315
316// ───── action dispatch (shared by click / type / key / scroll / move / list) ──
317
318impl ComputerUseDriver {
319    /// Click at the given SCREEN coordinates (already scaled by
320    /// `scale_coords`). `button` is `"left" | "middle" | "right"`.
321    pub async fn click(
322        &self,
323        sx: i32,
324        sy: i32,
325        button: &str,
326        token: &CancellationToken,
327    ) -> Result<()> {
328        let code = match button {
329            "middle" => "2",
330            "right" => "3",
331            _ => "1",
332        };
333        match self.backend {
334            Backend::X11 => {
335                run_cmd_cancellable(
336                    Command::new("xdotool").args([
337                        "mousemove",
338                        "--sync",
339                        &sx.to_string(),
340                        &sy.to_string(),
341                        "click",
342                        "--clearmodifiers",
343                        code,
344                    ]),
345                    token,
346                )
347                .await
348            },
349            Backend::Wayland => {
350                if !super::has_command("ydotool") {
351                    anyhow::bail!("ydotool required for Wayland mouse control")
352                }
353                run_cmd_cancellable(
354                    Command::new("ydotool").args([
355                        "mousemove",
356                        "--absolute",
357                        "-x",
358                        &sx.to_string(),
359                        "-y",
360                        &sy.to_string(),
361                    ]),
362                    token,
363                )
364                .await?;
365                run_cmd_cancellable(
366                    Command::new("ydotool").args(["click", &format!("0x{}", code)]),
367                    token,
368                )
369                .await
370            },
371            _ => anyhow::bail!("click not supported on this platform"),
372        }
373    }
374
375    /// Type text at the current focus. Per-keystroke delay from
376    /// `TYPE_KEY_DELAY_MS` — empirically needed for slow Electron /
377    /// web targets that drop characters at lower rates.
378    pub async fn type_text(&self, text: &str, token: &CancellationToken) -> Result<()> {
379        let delay = crate::constants::TYPE_KEY_DELAY_MS.to_string();
380        match self.backend {
381            Backend::X11 => {
382                run_cmd_cancellable(
383                    Command::new("xdotool").args([
384                        "type",
385                        "--clearmodifiers",
386                        "--delay",
387                        &delay,
388                        text,
389                    ]),
390                    token,
391                )
392                .await
393            },
394            Backend::Wayland => {
395                if super::has_command("wtype") {
396                    run_cmd_cancellable(Command::new("wtype").arg(text), token).await
397                } else if super::has_command("ydotool") {
398                    run_cmd_cancellable(
399                        Command::new("ydotool").args(["type", "--delay", &delay, text]),
400                        token,
401                    )
402                    .await
403                } else {
404                    anyhow::bail!("wtype or ydotool required for Wayland text input")
405                }
406            },
407            _ => anyhow::bail!("type_text not supported on this platform"),
408        }
409    }
410
411    /// Press a key (or key combination like `"ctrl+shift+t"`).
412    pub async fn press_key(&self, key: &str, token: &CancellationToken) -> Result<()> {
413        match self.backend {
414            Backend::X11 => {
415                run_cmd_cancellable(Command::new("xdotool").args(["key", key]), token).await
416            },
417            Backend::Wayland => {
418                if super::has_command("wtype") {
419                    // wtype: -M/-m modifiers around -k final key.
420                    let parts: Vec<&str> = key.split('+').collect();
421                    let mut args: Vec<String> = Vec::new();
422                    for (i, part) in parts.iter().enumerate() {
423                        if i < parts.len() - 1 {
424                            args.push("-M".to_string());
425                            args.push(part.to_string());
426                        } else {
427                            args.push("-k".to_string());
428                            args.push(part.to_string());
429                        }
430                    }
431                    for part in parts.iter().take(parts.len().saturating_sub(1)) {
432                        args.push("-m".to_string());
433                        args.push(part.to_string());
434                    }
435                    run_cmd_cancellable(Command::new("wtype").args(&args), token).await
436                } else if super::has_command("ydotool") {
437                    run_cmd_cancellable(Command::new("ydotool").args(["key", key]), token).await
438                } else {
439                    anyhow::bail!("wtype or ydotool required for Wayland key input")
440                }
441            },
442            _ => anyhow::bail!("press_key not supported on this platform"),
443        }
444    }
445
446    /// Scroll `amount` ticks in `direction` ("up" / "down").
447    pub async fn scroll(
448        &self,
449        direction: &str,
450        amount: i32,
451        token: &CancellationToken,
452    ) -> Result<()> {
453        match self.backend {
454            Backend::X11 => {
455                // xdotool: button 4 = scroll up, 5 = scroll down.
456                let button = if direction == "up" { "4" } else { "5" };
457                let mut args: Vec<String> = Vec::new();
458                for _ in 0..amount {
459                    args.push("click".to_string());
460                    args.push(button.to_string());
461                }
462                run_cmd_cancellable(Command::new("xdotool").args(&args), token).await
463            },
464            Backend::Wayland => {
465                if !super::has_command("ydotool") {
466                    anyhow::bail!("ydotool required for Wayland scroll")
467                }
468                let wheel_amount = if direction == "up" { -amount } else { amount };
469                run_cmd_cancellable(
470                    Command::new("ydotool").args([
471                        "mousemove",
472                        "--wheel",
473                        &wheel_amount.to_string(),
474                    ]),
475                    token,
476                )
477                .await
478            },
479            _ => anyhow::bail!("scroll not supported on this platform"),
480        }
481    }
482
483    /// Move the mouse cursor to SCREEN coords (already scaled).
484    pub async fn mouse_move(&self, sx: i32, sy: i32, token: &CancellationToken) -> Result<()> {
485        match self.backend {
486            Backend::X11 => {
487                run_cmd_cancellable(
488                    Command::new("xdotool").args([
489                        "mousemove",
490                        "--sync",
491                        &sx.to_string(),
492                        &sy.to_string(),
493                    ]),
494                    token,
495                )
496                .await
497            },
498            Backend::Wayland => {
499                if !super::has_command("ydotool") {
500                    anyhow::bail!("ydotool required for Wayland mouse control")
501                }
502                run_cmd_cancellable(
503                    Command::new("ydotool").args([
504                        "mousemove",
505                        "--absolute",
506                        "-x",
507                        &sx.to_string(),
508                        "-y",
509                        &sy.to_string(),
510                    ]),
511                    token,
512                )
513                .await
514            },
515            _ => anyhow::bail!("mouse_move not supported on this platform"),
516        }
517    }
518
519    /// List visible window titles. X11 only; Wayland has no portable
520    /// enumeration primitive.
521    pub async fn list_windows(&self, _token: &CancellationToken) -> Result<Vec<String>> {
522        if !matches!(self.backend, Backend::X11) {
523            anyhow::bail!(
524                "list_windows requires X11. Wayland has no portable window-enumeration \
525                 primitive. Run mermaid from an X11 session."
526            );
527        }
528        let wids =
529            run_cmd_stdout(Command::new("xdotool").args(["search", "--onlyvisible", "--name", ""]))
530                .await?;
531        let mut windows = Vec::new();
532        for wid in wids.lines() {
533            let wid = wid.trim();
534            if wid.is_empty() {
535                continue;
536            }
537            if let Ok(name) =
538                run_cmd_stdout(Command::new("xdotool").args(["getwindowname", wid])).await
539            {
540                let name = name.trim().to_string();
541                if !name.is_empty() && !windows.contains(&name) {
542                    windows.push(name);
543                }
544            }
545        }
546        Ok(windows)
547    }
548}
549
550// ───── RAII temp-file cleanup ──────────────────────────────────────
551
/// Removes the wrapped path from disk when the guard is dropped.
struct TempFileGuard(PathBuf);

impl Drop for TempFileGuard {
    fn drop(&mut self) {
        // Best-effort cleanup: the file may never have been created, so
        // a removal error is deliberately ignored.
        std::fs::remove_file(self.0.as_path()).ok();
    }
}
559
560// ───── subprocess dispatch ─────────────────────────────────────────
561
562async fn dispatch_capture(
563    backend: Backend,
564    spec: &ScreenshotSpec,
565    out_path: &str,
566    token: &CancellationToken,
567) -> Result<(i32, i32, String)> {
568    // Returns (offset_x, offset_y, kind_label). Each branch `select!`s
569    // on `token.cancelled()` so Ctrl+C during a slow capture aborts
570    // the subprocess cleanly.
571    match (backend, spec) {
572        (Backend::X11, ScreenshotSpec::Fullscreen) => {
573            run_cmd_cancellable(Command::new("scrot").args(["-o", out_path]), token).await?;
574            Ok((0, 0, "fullscreen".to_string()))
575        },
576        (Backend::Wayland, ScreenshotSpec::Fullscreen) => {
577            run_cmd_cancellable(Command::new("grim").arg(out_path), token).await?;
578            Ok((0, 0, "fullscreen".to_string()))
579        },
580        (Backend::MacOS, ScreenshotSpec::Fullscreen) => {
581            run_cmd_cancellable(Command::new("screencapture").args(["-x", out_path]), token)
582                .await?;
583            Ok((0, 0, "fullscreen".to_string()))
584        },
585        (Backend::X11, ScreenshotSpec::Focused) => {
586            let (wx, wy) = get_focused_window_geometry_x11()
587                .await
588                .map(|(x, y, _, _)| (x, y))
589                .unwrap_or((0, 0));
590            run_cmd_cancellable(Command::new("scrot").args(["-u", "-o", out_path]), token).await?;
591            Ok((wx, wy, "focused window".to_string()))
592        },
593        (Backend::Wayland, ScreenshotSpec::Focused) => anyhow::bail!(
594            "Mode 'focused' not supported on Wayland (grim has no focused-window \
595             primitive). Use mode: 'fullscreen' or mode: 'monitor' with a specific \
596             output name."
597        ),
598        (Backend::MacOS, ScreenshotSpec::Focused) => {
599            run_cmd_cancellable(
600                Command::new("screencapture").args(["-x", "-W", out_path]),
601                token,
602            )
603            .await?;
604            Ok((0, 0, "focused window".to_string()))
605        },
606        (Backend::X11, ScreenshotSpec::Region(x, y, w, h)) => {
607            run_cmd_cancellable(
608                Command::new("scrot").args([
609                    "-a",
610                    &format!("{},{},{},{}", x, y, w, h),
611                    "-o",
612                    out_path,
613                ]),
614                token,
615            )
616            .await?;
617            Ok((*x, *y, format!("region {}x{}+{}+{}", w, h, x, y)))
618        },
619        (Backend::Wayland, ScreenshotSpec::Region(x, y, w, h)) => {
620            run_cmd_cancellable(
621                Command::new("grim").args(["-g", &format!("{},{} {}x{}", x, y, w, h), out_path]),
622                token,
623            )
624            .await?;
625            Ok((*x, *y, format!("region {}x{}+{}+{}", w, h, x, y)))
626        },
627        (Backend::X11, ScreenshotSpec::Monitor(name)) => {
628            let (mx, my, mw, mh) = parse_monitor_geometry_x11(name).await.ok_or_else(|| {
629                anyhow::anyhow!(
630                    "Monitor '{}' not found. Run `xrandr --query` to list outputs.",
631                    name
632                )
633            })?;
634            run_cmd_cancellable(
635                Command::new("scrot").args([
636                    "-a",
637                    &format!("{},{},{},{}", mx, my, mw, mh),
638                    "-o",
639                    out_path,
640                ]),
641                token,
642            )
643            .await?;
644            Ok((mx, my, format!("monitor {}", name)))
645        },
646        (Backend::Wayland, ScreenshotSpec::Monitor(name)) => {
647            run_cmd_cancellable(Command::new("grim").args(["-o", name, out_path]), token).await?;
648            Ok((0, 0, format!("monitor {}", name)))
649        },
650        (Backend::X11, ScreenshotSpec::Window(title)) => {
651            // Search for window by name, activate it, sync, then
652            // capture the focused window.
653            let wid = run_cmd_stdout(Command::new("xdotool").args(["search", "--name", title]))
654                .await?
655                .lines()
656                .next()
657                .map(str::trim)
658                .filter(|s| !s.is_empty())
659                .map(str::to_string)
660                .ok_or_else(|| {
661                    anyhow::anyhow!(
662                        "No window found matching '{}'. Use list_windows to see available \
663                         windows.",
664                        title
665                    )
666                })?;
667            run_cmd_cancellable(
668                Command::new("xdotool").args(["windowactivate", "--sync", &wid]),
669                token,
670            )
671            .await?;
672            tokio::time::sleep(std::time::Duration::from_millis(
673                crate::constants::WINDOW_FOCUS_DELAY_MS,
674            ))
675            .await;
676            let (wx, wy) = get_window_geometry_x11(&wid)
677                .await
678                .map(|(x, y, _, _)| (x, y))
679                .unwrap_or((0, 0));
680            run_cmd_cancellable(Command::new("scrot").args(["-u", "-o", out_path]), token).await?;
681            Ok((wx, wy, format!("window \"{}\"", title)))
682        },
683        (Backend::Wayland, ScreenshotSpec::Window(_)) => anyhow::bail!(
684            "Mode 'window' not supported on Wayland (grim has no window-by-name capture). \
685             Use mode: 'fullscreen' or mode: 'monitor' with a specific output name."
686        ),
687        (Backend::MacOS, _) => anyhow::bail!(
688            "This screenshot mode is not yet ported to macOS. Use mode: 'fullscreen' for now."
689        ),
690        (Backend::Windows, _) | (Backend::Unsupported, _) => {
691            anyhow::bail!("Unsupported platform for computer-use capture")
692        },
693    }
694}
695
696/// Run a `Command` to completion, racing it against cancellation.
697/// Relies on `kill_on_drop(true)` reaping the child when the future
698/// is dropped on cancel.
699pub(crate) async fn run_cmd_cancellable(
700    cmd: &mut Command,
701    token: &CancellationToken,
702) -> Result<()> {
703    cmd.kill_on_drop(true);
704    tokio::select! {
705        biased;
706        _ = token.cancelled() => anyhow::bail!("cancelled"),
707        res = cmd.output() => {
708            let out = res.context("subprocess spawn")?;
709            if !out.status.success() {
710                anyhow::bail!(
711                    "subprocess failed: {}",
712                    String::from_utf8_lossy(&out.stderr).trim()
713                );
714            }
715            Ok(())
716        }
717    }
718}
719
720async fn run_cmd_stdout(cmd: &mut Command) -> Result<String> {
721    let out = cmd.output().await.context("subprocess spawn")?;
722    if !out.status.success() {
723        anyhow::bail!(
724            "subprocess failed: {}",
725            String::from_utf8_lossy(&out.stderr).trim()
726        );
727    }
728    Ok(String::from_utf8_lossy(&out.stdout).to_string())
729}
730
731// ───── geometry helpers (X11 only; Wayland has no equivalent) ──────
732
733async fn get_focused_window_geometry_x11() -> Option<(i32, i32, u32, u32)> {
734    let wid = run_cmd_stdout(Command::new("xdotool").arg("getactivewindow"))
735        .await
736        .ok()?;
737    let wid = wid.trim();
738    if wid.is_empty() {
739        return None;
740    }
741    get_window_geometry_x11(wid).await
742}
743
744async fn get_window_geometry_x11(wid: &str) -> Option<(i32, i32, u32, u32)> {
745    let out = run_cmd_stdout(Command::new("xdotool").args(["getwindowgeometry", "--shell", wid]))
746        .await
747        .ok()?;
748    let mut x = None;
749    let mut y = None;
750    let mut width = None;
751    let mut height = None;
752    for line in out.lines() {
753        if let Some(v) = line.strip_prefix("X=") {
754            x = v.parse().ok();
755        } else if let Some(v) = line.strip_prefix("Y=") {
756            y = v.parse().ok();
757        } else if let Some(v) = line.strip_prefix("WIDTH=") {
758            width = v.parse().ok();
759        } else if let Some(v) = line.strip_prefix("HEIGHT=") {
760            height = v.parse().ok();
761        }
762    }
763    Some((x?, y?, width?, height?))
764}
765
766async fn parse_monitor_geometry_x11(name: &str) -> Option<(i32, i32, u32, u32)> {
767    let out = run_cmd_stdout(Command::new("xrandr").arg("--query"))
768        .await
769        .ok()?;
770    for line in out.lines() {
771        if !line.contains(" connected") {
772            continue;
773        }
774        let parts: Vec<&str> = line.split_whitespace().collect();
775        if parts.first() != Some(&name) {
776            continue;
777        }
778        for part in &parts[2..] {
779            if let Some((res, offsets)) = part.split_once('+')
780                && let Some((w, h)) = res.split_once('x')
781            {
782                let width = w.parse::<u32>().ok()?;
783                let height = h.parse::<u32>().ok()?;
784                let mut off = offsets.splitn(2, '+');
785                let x = off.next()?.parse::<i32>().ok()?;
786                let y = off.next()?.parse::<i32>().ok()?;
787                return Some((x, y, width, height));
788            }
789        }
790    }
791    None
792}
793
794// ───── PNG inspection (no image crate dep) ─────────────────────────
795
/// Reads the image width from a PNG header without decoding the image.
///
/// Returns the big-endian `u32` stored at bytes 16..20 when `bytes` is
/// longer than 24 bytes and starts with the 8-byte PNG signature;
/// otherwise `None`.
fn read_png_width(bytes: &[u8]) -> Option<u32> {
    const PNG_SIGNATURE: &[u8] = b"\x89PNG\r\n\x1a\n";
    if bytes.len() <= 24 || !bytes.starts_with(PNG_SIGNATURE) {
        return None;
    }
    let width = bytes[16..20]
        .iter()
        .fold(0u32, |acc, &byte| (acc << 8) | u32::from(byte));
    Some(width)
}
805
/// Reads the image height from a PNG header without decoding the image.
///
/// Returns the big-endian `u32` stored at bytes 20..24 when `bytes` is
/// longer than 28 bytes and starts with the 8-byte PNG signature;
/// otherwise `None`.
fn read_png_height(bytes: &[u8]) -> Option<u32> {
    const PNG_SIGNATURE: &[u8] = b"\x89PNG\r\n\x1a\n";
    if bytes.len() <= 28 || !bytes.starts_with(PNG_SIGNATURE) {
        return None;
    }
    let height = bytes[20..24]
        .iter()
        .fold(0u32, |acc, &byte| (acc << 8) | u32::from(byte));
    Some(height)
}
815
816/// Downscale the PNG at `path` to at most `max_width` pixels wide,
817/// using ImageMagick `convert` or ffmpeg as a fallback. Returns the
818/// scale factor (original_width / max_width; 1.0 if no scaling was
819/// needed).
820async fn downscale_if_needed(path: &str, max_width: u32) -> Result<f64> {
821    let bytes = tokio::fs::read(path).await?;
822    let original_width = read_png_width(&bytes).unwrap_or(1920);
823    if original_width <= max_width {
824        return Ok(1.0);
825    }
826    let scale_factor = original_width as f64 / max_width as f64;
827    let scaled = format!("{}.scaled.png", path);
828
829    let convert = Command::new("convert")
830        .args([path, "-resize", &format!("{}x", max_width), &scaled])
831        .output()
832        .await;
833    if let Ok(o) = convert
834        && o.status.success()
835    {
836        tokio::fs::rename(&scaled, path).await?;
837        return Ok(scale_factor);
838    }
839
840    let ffmpeg = Command::new("ffmpeg")
841        .args([
842            "-y",
843            "-i",
844            path,
845            "-vf",
846            &format!("scale={}:-1", max_width),
847            &scaled,
848        ])
849        .output()
850        .await;
851    if let Ok(o) = ffmpeg
852        && o.status.success()
853    {
854        tokio::fs::rename(&scaled, path).await?;
855        return Ok(scale_factor);
856    }
857
858    let _ = tokio::fs::remove_file(&scaled).await;
859    tracing::warn!(
860        original_width,
861        "neither ImageMagick nor ffmpeg available; sending full-resolution screenshot"
862    );
863    Ok(1.0)
864}
865
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fullscreen metadata entry with identity scaling.
    fn fullscreen_meta(id: u64) -> ScreenshotMetadata {
        ScreenshotMetadata {
            id,
            scale_factor: 1.0,
            offset_x: 0,
            offset_y: 0,
            kind: "fullscreen".to_string(),
        }
    }

    #[test]
    fn registry_lru_evicts_oldest_past_capacity() {
        let overflow: u64 = 3;
        let total = SCREENSHOT_REGISTRY_CAPACITY as u64 + overflow;

        let mut registry = ScreenshotRegistry::new();
        (0..total).for_each(|id| registry.push(fullscreen_meta(id)));

        assert_eq!(registry.len(), SCREENSHOT_REGISTRY_CAPACITY);
        // The first three ids were pushed out by the overflow.
        for evicted in 0..overflow {
            assert!(registry.get(evicted).is_none());
        }
        // The newest entry is still the latest.
        let newest = registry.latest().unwrap();
        assert_eq!(newest.id, SCREENSHOT_REGISTRY_CAPACITY as u64 + 2);
    }

    #[test]
    fn scale_coords_applies_scale_and_offset() {
        let driver = ComputerUseDriver::new(Backend::X11);
        let id = driver.register_screenshot(2.0, 100, 50, "fullscreen".to_string());

        let (sx, sy) = driver.scale_coords(10, 20, Some(id)).unwrap();

        assert_eq!(sx, 100 + 20);
        assert_eq!(sy, 50 + 40);
    }

    #[test]
    fn scale_coords_errors_on_evicted_id() {
        let driver = ComputerUseDriver::new(Backend::X11);
        // One more registration than the registry can hold evicts id 0.
        for _ in 0..(SCREENSHOT_REGISTRY_CAPACITY + 1) {
            driver.register_screenshot(1.0, 0, 0, "fullscreen".to_string());
        }

        let err = driver.scale_coords(0, 0, Some(0)).unwrap_err();

        assert!(
            err.contains("evicted"),
            "expected eviction message, got: {}",
            err
        );
    }

    #[test]
    fn scale_coords_errors_with_no_screenshots_yet() {
        let driver = ComputerUseDriver::new(Backend::X11);

        let err = driver.scale_coords(10, 20, None).unwrap_err();

        assert!(err.contains("No screenshots"));
    }

    #[test]
    fn ensure_alive_fails_on_unsupported_backend() {
        let driver = ComputerUseDriver::new(Backend::Unsupported);

        assert!(driver.ensure_alive().is_err());
    }
}
930}