Skip to main content

mermaid_cli/providers/tool/computer_use/
press_key.rs

1//! `press_key` — single key or key combination at current focus.
2//! Keys: simple names (`"Return"`, `"Escape"`, `"Tab"`, `"F5"`,
3//! …) or modifier chains (`"ctrl+shift+t"`, `"alt+Tab"`). Uses
4//! xdotool/wtype/ydotool syntax pass-through; the model is
5//! expected to know the platform convention.
6
7use std::sync::Arc;
8use std::time::Instant;
9
10use async_trait::async_trait;
11use serde_json::Value;
12
13use crate::constants::POST_KEY_DELAY_MS;
14use crate::domain::{ToolDefinition, ToolOutcome};
15use crate::providers::ctx::{ExecContext, ProgressEvent};
16
17use super::super::ToolExecutor;
18use super::computer_use_success;
19use super::driver::ComputerUseDriver;
20
21pub struct PressKeyTool {
22    driver: Arc<ComputerUseDriver>,
23}
24
25impl PressKeyTool {
26    pub fn new(driver: Arc<ComputerUseDriver>) -> Self {
27        Self { driver }
28    }
29}
30
31#[async_trait]
32impl ToolExecutor for PressKeyTool {
33    fn name(&self) -> &'static str {
34        "press_key"
35    }
36
37    fn schema(&self) -> ToolDefinition {
38        ToolDefinition {
39            name: "press_key".to_string(),
40            description: "Press a key or chord: 'Return', 'Escape', 'Tab', 'F5', 'ctrl+shift+t', \
41                 'alt+Tab', etc. Uses xdotool/wtype naming. ALWAYS click the target \
42                 window first. Auto-captures the focused window afterwards."
43                .to_string(),
44            input_schema: serde_json::json!({
45                "type": "object",
46                "properties": { "key": { "type": "string" } },
47                "required": ["key"]
48            }),
49        }
50    }
51
52    async fn execute(&self, args: Value, ctx: ExecContext) -> ToolOutcome {
53        let started = Instant::now();
54        if let Err(error) = self.driver.ensure_alive() {
55            return ToolOutcome::error(error, started.elapsed().as_secs_f64());
56        }
57        let key = match args.get("key").and_then(|v| v.as_str()) {
58            Some(s) => s.to_string(),
59            None => {
60                return ToolOutcome::error(
61                    "press_key requires `key` string",
62                    started.elapsed().as_secs_f64(),
63                );
64            },
65        };
66
67        let res = tokio::select! {
68            biased;
69            _ = ctx.token.cancelled() => return ToolOutcome::cancelled(),
70            r = self.driver.press_key(&key, &ctx.token) => r,
71        };
72        if let Err(e) = res {
73            return ToolOutcome::error(
74                format!("press_key failed: {}", e),
75                started.elapsed().as_secs_f64(),
76            );
77        }
78
79        tokio::time::sleep(std::time::Duration::from_millis(POST_KEY_DELAY_MS)).await;
80        let base_msg = format!("Pressed: {}", key);
81
82        let (summary, image) = match self.driver.capture_focused_for_autoshot(&ctx.token).await {
83            Some((s, b64)) => (Some(s), Some(b64)),
84            None => (None, None),
85        };
86
87        if let Some(b64) = &image
88            && let Ok(bytes) =
89                base64::Engine::decode(&base64::engine::general_purpose::STANDARD, b64)
90        {
91            let _ = ctx
92                .progress
93                .send(ProgressEvent::Artifact {
94                    mime: "image/png".to_string(),
95                    data: bytes,
96                    caption: Some("press_key auto-screenshot".to_string()),
97                })
98                .await;
99        }
100
101        let out = match &summary {
102            Some(s) => format!("{}\n[auto-screenshot: {}]", base_msg, s),
103            None => base_msg,
104        };
105        let mut outcome =
106            computer_use_success("press_key", args, out, started.elapsed().as_secs_f64());
107        if let Some(image) = image {
108            outcome = outcome.with_images(vec![image]);
109        }
110        outcome
111    }
112}