mermaid_cli/providers/tool/computer_use/
click.rs1use std::sync::Arc;
7use std::time::Instant;
8
9use async_trait::async_trait;
10use serde_json::Value;
11
12use crate::constants::POST_CLICK_DELAY_MS;
13use crate::domain::{ToolDefinition, ToolOutcome};
14use crate::providers::ctx::{ExecContext, ProgressEvent};
15
16use super::super::ToolExecutor;
17use super::computer_use_success;
18use super::driver::ComputerUseDriver;
19
20pub struct ClickTool {
21 driver: Arc<ComputerUseDriver>,
22}
23
24impl ClickTool {
25 pub fn new(driver: Arc<ComputerUseDriver>) -> Self {
26 Self { driver }
27 }
28}
29
30#[async_trait]
31impl ToolExecutor for ClickTool {
32 fn name(&self) -> &'static str {
33 "click"
34 }
35
36 fn schema(&self) -> ToolDefinition {
37 ToolDefinition {
38 name: "click".to_string(),
39 description:
40 "Click at model-space (x, y). Pass `screenshot_id` to lock coordinates to a \
41 specific past screenshot; omit for the most recent. Auto-captures the \
42 focused window afterwards so the result is visible inline."
43 .to_string(),
44 input_schema: serde_json::json!({
45 "type": "object",
46 "properties": {
47 "x": { "type": "integer" },
48 "y": { "type": "integer" },
49 "button": { "type": "string", "enum": ["left", "middle", "right"], "default": "left" },
50 "screenshot_id": { "type": "integer" }
51 },
52 "required": ["x", "y"]
53 }),
54 }
55 }
56
57 async fn execute(&self, args: Value, ctx: ExecContext) -> ToolOutcome {
58 let started = Instant::now();
59 if let Err(error) = self.driver.ensure_alive() {
60 return ToolOutcome::error(error, started.elapsed().as_secs_f64());
61 }
62
63 let x = args.get("x").and_then(|v| v.as_i64()).map(|n| n as i32);
64 let y = args.get("y").and_then(|v| v.as_i64()).map(|n| n as i32);
65 let (x, y) = match (x, y) {
66 (Some(x), Some(y)) => (x, y),
67 _ => {
68 return ToolOutcome::error(
69 "click requires integer `x` and `y`",
70 started.elapsed().as_secs_f64(),
71 );
72 },
73 };
74 let button = args
75 .get("button")
76 .and_then(|v| v.as_str())
77 .unwrap_or("left")
78 .to_string();
79 let screenshot_id = args.get("screenshot_id").and_then(|v| v.as_u64());
80
81 let (sx, sy) = match self.driver.scale_coords(x, y, screenshot_id) {
82 Ok(p) => p,
83 Err(e) => return ToolOutcome::error(e, started.elapsed().as_secs_f64()),
84 };
85
86 let click_res = tokio::select! {
87 biased;
88 _ = ctx.token.cancelled() => return ToolOutcome::cancelled(),
89 r = self.driver.click(sx, sy, &button, &ctx.token) => r,
90 };
91 if let Err(e) = click_res {
92 return ToolOutcome::error(
93 format!("click failed: {}", e),
94 started.elapsed().as_secs_f64(),
95 );
96 }
97
98 tokio::time::sleep(std::time::Duration::from_millis(POST_CLICK_DELAY_MS)).await;
101
102 let mut msg = format!(
103 "Clicked {} at ({}, {}) [screen: ({}, {})]",
104 button, x, y, sx, sy
105 );
106 if let Some(warning) = self.driver.check_cursor_landed(sx, sy).await {
107 msg.push('\n');
108 msg.push_str(&warning);
109 }
110
111 let (summary, image) = match self.driver.capture_focused_for_autoshot(&ctx.token).await {
112 Some((s, b64)) => (Some(s), Some(b64)),
113 None => (None, None),
114 };
115
116 if let Some(b64) = &image
117 && let Ok(bytes) =
118 base64::Engine::decode(&base64::engine::general_purpose::STANDARD, b64)
119 {
120 let _ = ctx
121 .progress
122 .send(ProgressEvent::Artifact {
123 mime: "image/png".to_string(),
124 data: bytes,
125 caption: Some("click auto-screenshot".to_string()),
126 })
127 .await;
128 }
129
130 let final_output = match &summary {
131 Some(s) => format!("{}\n[auto-screenshot: {}]", msg, s),
132 None => msg,
133 };
134 let mut outcome =
135 computer_use_success("click", args, final_output, started.elapsed().as_secs_f64());
136 if let Some(image) = image {
137 outcome = outcome.with_images(vec![image]);
138 }
139 outcome
140 }
141}