Skip to main content

mermaid_cli/providers/tool/computer_use/
click.rs

1//! `click` — mouse click at model-space `(x, y)`. `screenshot_id`
2//! selects which capture the coords refer to (None = latest). After
3//! clicking, auto-captures the focused window so the model can
4//! verify the result inline.
5
6use std::sync::Arc;
7use std::time::Instant;
8
9use async_trait::async_trait;
10use serde_json::Value;
11
12use crate::constants::POST_CLICK_DELAY_MS;
13use crate::domain::{ToolDefinition, ToolOutcome};
14use crate::providers::ctx::{ExecContext, ProgressEvent};
15
16use super::super::ToolExecutor;
17use super::computer_use_success;
18use super::driver::ComputerUseDriver;
19
20pub struct ClickTool {
21    driver: Arc<ComputerUseDriver>,
22}
23
24impl ClickTool {
25    pub fn new(driver: Arc<ComputerUseDriver>) -> Self {
26        Self { driver }
27    }
28}
29
30#[async_trait]
31impl ToolExecutor for ClickTool {
32    fn name(&self) -> &'static str {
33        "click"
34    }
35
36    fn schema(&self) -> ToolDefinition {
37        ToolDefinition {
38            name: "click".to_string(),
39            description:
40                "Click at model-space (x, y). Pass `screenshot_id` to lock coordinates to a \
41                 specific past screenshot; omit for the most recent. Auto-captures the \
42                 focused window afterwards so the result is visible inline."
43                    .to_string(),
44            input_schema: serde_json::json!({
45                "type": "object",
46                "properties": {
47                    "x": { "type": "integer" },
48                    "y": { "type": "integer" },
49                    "button": { "type": "string", "enum": ["left", "middle", "right"], "default": "left" },
50                    "screenshot_id": { "type": "integer" }
51                },
52                "required": ["x", "y"]
53            }),
54        }
55    }
56
57    async fn execute(&self, args: Value, ctx: ExecContext) -> ToolOutcome {
58        let started = Instant::now();
59        if let Err(error) = self.driver.ensure_alive() {
60            return ToolOutcome::error(error, started.elapsed().as_secs_f64());
61        }
62
63        let x = args.get("x").and_then(|v| v.as_i64()).map(|n| n as i32);
64        let y = args.get("y").and_then(|v| v.as_i64()).map(|n| n as i32);
65        let (x, y) = match (x, y) {
66            (Some(x), Some(y)) => (x, y),
67            _ => {
68                return ToolOutcome::error(
69                    "click requires integer `x` and `y`",
70                    started.elapsed().as_secs_f64(),
71                );
72            },
73        };
74        let button = args
75            .get("button")
76            .and_then(|v| v.as_str())
77            .unwrap_or("left")
78            .to_string();
79        let screenshot_id = args.get("screenshot_id").and_then(|v| v.as_u64());
80
81        let (sx, sy) = match self.driver.scale_coords(x, y, screenshot_id) {
82            Ok(p) => p,
83            Err(e) => return ToolOutcome::error(e, started.elapsed().as_secs_f64()),
84        };
85
86        let click_res = tokio::select! {
87            biased;
88            _ = ctx.token.cancelled() => return ToolOutcome::cancelled(),
89            r = self.driver.click(sx, sy, &button, &ctx.token) => r,
90        };
91        if let Err(e) = click_res {
92            return ToolOutcome::error(
93                format!("click failed: {}", e),
94                started.elapsed().as_secs_f64(),
95            );
96        }
97
98        // Let the WM process focus change + UI update before the
99        // auto-screenshot captures the result.
100        tokio::time::sleep(std::time::Duration::from_millis(POST_CLICK_DELAY_MS)).await;
101
102        let mut msg = format!(
103            "Clicked {} at ({}, {}) [screen: ({}, {})]",
104            button, x, y, sx, sy
105        );
106        if let Some(warning) = self.driver.check_cursor_landed(sx, sy).await {
107            msg.push('\n');
108            msg.push_str(&warning);
109        }
110
111        let (summary, image) = match self.driver.capture_focused_for_autoshot(&ctx.token).await {
112            Some((s, b64)) => (Some(s), Some(b64)),
113            None => (None, None),
114        };
115
116        if let Some(b64) = &image
117            && let Ok(bytes) =
118                base64::Engine::decode(&base64::engine::general_purpose::STANDARD, b64)
119        {
120            let _ = ctx
121                .progress
122                .send(ProgressEvent::Artifact {
123                    mime: "image/png".to_string(),
124                    data: bytes,
125                    caption: Some("click auto-screenshot".to_string()),
126                })
127                .await;
128        }
129
130        let final_output = match &summary {
131            Some(s) => format!("{}\n[auto-screenshot: {}]", msg, s),
132            None => msg,
133        };
134        let mut outcome =
135            computer_use_success("click", args, final_output, started.elapsed().as_secs_f64());
136        if let Some(image) = image {
137            outcome = outcome.with_images(vec![image]);
138        }
139        outcome
140    }
141}