use anyhow::{Result, anyhow};
use serde_json::{Value, json};
use super::platform::{
jpeg_dimensions, is_cliclick_special_key, map_modifier, map_modifier_xdotool,
powershell_hidden, run_powershell_input, run_subprocess, win_map_key, win_mouse_click,
win_set_cursor,
};
impl super::runtime::AgentRuntime {
pub(crate) async fn tool_computer_use(&self, args: Value) -> Result<Value> {
let action = args["action"]
.as_str()
.ok_or_else(|| anyhow!("computer_use: `action` required"))?;
let is_macos = cfg!(target_os = "macos");
let is_windows = cfg!(target_os = "windows");
let xy = || {
(
args["x"].as_f64().unwrap_or(0.0) as i64,
args["y"].as_f64().unwrap_or(0.0) as i64,
)
};
match action {
"screenshot" => {
let tmp_path = std::env::temp_dir().join("rsclaw_screen.png");
let tmp_path_str = tmp_path.to_string_lossy().to_string();
let output = if is_macos {
tokio::process::Command::new("screencapture")
.args(["-x", &tmp_path_str])
.output()
.await
} else if is_windows {
let script = format!(
r#"
Add-Type -AssemblyName System.Windows.Forms
Add-Type -AssemblyName System.Drawing
$screen = [System.Windows.Forms.Screen]::PrimaryScreen
$bounds = $screen.Bounds
$bitmap = New-Object System.Drawing.Bitmap($bounds.Width, $bounds.Height)
$graphics = [System.Drawing.Graphics]::FromImage($bitmap)
$graphics.CopyFromScreen($bounds.Location, [System.Drawing.Point]::Empty, $bounds.Size)
$bitmap.Save('{}')
$graphics.Dispose()
$bitmap.Dispose()
"#,
tmp_path_str
);
powershell_hidden()
.args(["-Command", &script])
.output()
.await
} else {
let res = tokio::process::Command::new("scrot")
.arg(&tmp_path_str)
.output()
.await;
if res.is_err() || !res.as_ref().unwrap().status.success() {
tokio::process::Command::new("import")
.args(["-window", "root", &tmp_path_str])
.output()
.await
} else {
res
}
}
.map_err(|e| anyhow!("computer_use screenshot: {e}"))?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(anyhow!("computer_use screenshot failed: {stderr}"));
}
let raw_bytes = tokio::fs::read(&tmp_path)
.await
.map_err(|e| anyhow!("computer_use: failed to read screenshot: {e}"))?;
let (orig_w, orig_h) = if raw_bytes.len() >= 24 {
let w = u32::from_be_bytes([raw_bytes[16], raw_bytes[17], raw_bytes[18], raw_bytes[19]]);
let h = u32::from_be_bytes([raw_bytes[20], raw_bytes[21], raw_bytes[22], raw_bytes[23]]);
(w, h)
} else {
(0, 0)
};
const TARGET_WIDTH: u32 = 1024;
const JPG_QUALITY: u32 = 30;
let out_path = std::env::temp_dir().join("rsclaw_screen_out.jpg");
let out_str = out_path.to_string_lossy().to_string();
let need_resize = orig_w > TARGET_WIDTH;
let converted = if is_macos {
let mut sips_args = vec![];
if need_resize {
sips_args.extend_from_slice(&["--resampleWidth", "1024"]);
}
sips_args.extend_from_slice(&[
"-s", "format", "jpeg",
"-s", "formatOptions", "30",
&tmp_path_str,
"--out", &out_str,
]);
tokio::process::Command::new("sips")
.args(&sips_args)
.output()
.await
.map(|o| o.status.success())
.unwrap_or(false)
} else if is_windows {
let new_w = if need_resize { TARGET_WIDTH } else { orig_w };
let new_h = if need_resize {
(orig_h as f64 * TARGET_WIDTH as f64 / orig_w as f64) as u32
} else {
orig_h
};
let script = format!(
r#"
Add-Type -AssemblyName System.Drawing
$src = [System.Drawing.Image]::FromFile('{tmp_path_str}')
$dst = New-Object System.Drawing.Bitmap({new_w}, {new_h})
$g = [System.Drawing.Graphics]::FromImage($dst)
$g.InterpolationMode = [System.Drawing.Drawing2D.InterpolationMode]::HighQualityBicubic
$g.DrawImage($src, 0, 0, {new_w}, {new_h})
$codec = [System.Drawing.Imaging.ImageCodecInfo]::GetImageEncoders() | Where-Object {{ $_.MimeType -eq 'image/jpeg' }}
$params = New-Object System.Drawing.Imaging.EncoderParameters(1)
$params.Param[0] = New-Object System.Drawing.Imaging.EncoderParameter([System.Drawing.Imaging.Encoder]::Quality, [long]{JPG_QUALITY})
$dst.Save('{out_str}', $codec, $params)
$g.Dispose(); $dst.Dispose(); $src.Dispose()
"#
);
powershell_hidden()
.args(["-Command", &script])
.output()
.await
.map(|o| o.status.success())
.unwrap_or(false)
} else {
let resize_arg = if need_resize { "1024x" } else { "100%" };
tokio::process::Command::new("convert")
.args([&tmp_path_str, "-resize", resize_arg, "-quality", "30", &out_str])
.output()
.await
.map(|o| o.status.success())
.unwrap_or(false)
};
let (bytes, mime) = if converted {
let b = tokio::fs::read(&out_path).await.unwrap_or(raw_bytes);
let _ = tokio::fs::remove_file(&out_path).await;
(b, "image/jpeg")
} else {
(raw_bytes, "image/png")
};
let _ = tokio::fs::remove_file(&tmp_path).await;
let (width, height) = if mime == "image/jpeg" {
jpeg_dimensions(&bytes).unwrap_or((0, 0))
} else if bytes.len() >= 24 {
let w = u32::from_be_bytes([bytes[16], bytes[17], bytes[18], bytes[19]]);
let h = u32::from_be_bytes([bytes[20], bytes[21], bytes[22], bytes[23]]);
(w, h)
} else {
(0, 0)
};
use base64::Engine;
let b64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
let scale = if width > 0 && orig_w > width { orig_w as f64 / width as f64 } else { 1.0 };
Ok(json!({
"action": "screenshot",
"image": format!("data:{mime};base64,{b64}"),
"width": width,
"height": height,
"original_width": orig_w,
"original_height": orig_h,
"scale": scale
}))
}
"mouse_move" => {
let (x, y) = xy();
if is_macos {
run_subprocess("cliclick", &[&format!("m:{x},{y}")]).await?;
} else if is_windows {
win_set_cursor(x, y).await?;
} else {
run_subprocess("xdotool", &["mousemove", &x.to_string(), &y.to_string()])
.await?;
}
Ok(json!({"action": "mouse_move", "ok": true}))
}
"mouse_click" | "left_click" => {
let (x, y) = xy();
let button = args["button"].as_str().unwrap_or("left");
if is_macos {
match button {
"right" => run_subprocess("cliclick", &[&format!("rc:{x},{y}")]).await?,
"middle" => {
run_subprocess("swift", &["-e", &format!(
"import CoreGraphics; \
let pt = CGPoint(x: {x}, y: {y}); \
if let d = CGEvent(mouseEventSource: nil, mouseType: .otherMouseDown, mouseCursorPosition: pt, mouseButton: .center), \
let u = CGEvent(mouseEventSource: nil, mouseType: .otherMouseUp, mouseCursorPosition: pt, mouseButton: .center) \
{{ d.post(tap: .cghidEventTap); u.post(tap: .cghidEventTap) }}"
)]).await?;
}
_ => run_subprocess("cliclick", &[&format!("c:{x},{y}")]).await?,
}
} else if is_windows {
win_mouse_click(x, y, button, 1).await?;
} else {
let btn = match button { "right" => "3", "middle" => "2", _ => "1" };
run_subprocess("xdotool", &["mousemove", "--sync", &x.to_string(), &y.to_string(), "click", btn]).await?;
}
Ok(json!({"action": "mouse_click", "button": button, "ok": true}))
}
"double_click" => {
let (x, y) = xy();
if is_macos {
run_subprocess("cliclick", &[&format!("dc:{x},{y}")]).await?;
} else if is_windows {
win_mouse_click(x, y, "left", 2).await?;
} else {
run_subprocess("xdotool", &["mousemove", "--sync", &x.to_string(), &y.to_string(), "click", "--repeat", "2", "--delay", "50", "1"]).await?;
}
Ok(json!({"action": "double_click", "ok": true}))
}
"triple_click" => {
let (x, y) = xy();
if is_macos {
let pos = format!("c:{x},{y}");
run_subprocess("cliclick", &[&pos, &pos, &pos]).await?;
} else if is_windows {
win_mouse_click(x, y, "left", 3).await?;
} else {
run_subprocess("xdotool", &["mousemove", "--sync", &x.to_string(), &y.to_string(), "click", "--repeat", "3", "--delay", "50", "1"]).await?;
}
Ok(json!({"action": "triple_click", "ok": true}))
}
"right_click" => {
let (x, y) = xy();
if is_macos {
run_subprocess("cliclick", &[&format!("rc:{x},{y}")]).await?;
} else if is_windows {
win_mouse_click(x, y, "right", 1).await?;
} else {
run_subprocess("xdotool", &["mousemove", "--sync", &x.to_string(), &y.to_string(), "click", "3"]).await?;
}
Ok(json!({"action": "right_click", "ok": true}))
}
"middle_click" => {
let (x, y) = xy();
if is_macos {
run_subprocess("swift", &["-e", &format!(
"import CoreGraphics; \
let pt = CGPoint(x: {x}, y: {y}); \
if let d = CGEvent(mouseEventSource: nil, mouseType: .otherMouseDown, mouseCursorPosition: pt, mouseButton: .center), \
let u = CGEvent(mouseEventSource: nil, mouseType: .otherMouseUp, mouseCursorPosition: pt, mouseButton: .center) \
{{ d.post(tap: .cghidEventTap); u.post(tap: .cghidEventTap) }}"
)]).await?;
} else if is_windows {
win_mouse_click(x, y, "middle", 1).await?;
} else {
run_subprocess("xdotool", &["mousemove", "--sync", &x.to_string(), &y.to_string(), "click", "2"]).await?;
}
Ok(json!({"action": "middle_click", "ok": true}))
}
"drag" => {
let x1 = args["x"].as_f64().unwrap_or(0.0) as i64;
let y1 = args["y"].as_f64().unwrap_or(0.0) as i64;
let x2 = args["to_x"].as_f64()
.ok_or_else(|| anyhow!("computer_use drag: `to_x` required"))? as i64;
let y2 = args["to_y"].as_f64()
.ok_or_else(|| anyhow!("computer_use drag: `to_y` required"))? as i64;
if is_macos {
run_subprocess("cliclick", &[&format!("dd:{x1},{y1}"), &format!("du:{x2},{y2}")]).await?;
} else if is_windows {
run_powershell_input(&format!(
r#"Add-Type @"
using System;
using System.Runtime.InteropServices;
public class WinDrag {{
[DllImport("user32.dll")] public static extern bool SetCursorPos(int x, int y);
[DllImport("user32.dll")] static extern void mouse_event(uint f, uint dx, uint dy, uint d, int e);
public static void Drag(int x1, int y1, int x2, int y2) {{
SetCursorPos(x1, y1);
System.Threading.Thread.Sleep(50);
mouse_event(0x0002, 0, 0, 0, 0); // LEFTDOWN
System.Threading.Thread.Sleep(50);
SetCursorPos(x2, y2);
System.Threading.Thread.Sleep(50);
mouse_event(0x0004, 0, 0, 0, 0); // LEFTUP
}}
}}
"@
[WinDrag]::Drag({x1}, {y1}, {x2}, {y2})"#
)).await?;
} else {
run_subprocess("xdotool", &[
"mousemove", &x1.to_string(), &y1.to_string(),
"mousedown", "1",
"mousemove", "--sync", &x2.to_string(), &y2.to_string(),
"mouseup", "1",
]).await?;
}
Ok(json!({"action": "drag", "from": [x1, y1], "to": [x2, y2], "ok": true}))
}
"scroll" => {
let (x, y) = xy();
let direction = args["direction"].as_str().unwrap_or("down");
let amount = args["amount"].as_i64().unwrap_or(3);
if is_macos {
if x != 0 || y != 0 {
run_subprocess("cliclick", &[&format!("m:{x},{y}")]).await?;
}
let (scroll_y, scroll_x) = match direction {
"up" => (amount, 0i64),
"down" => (-amount, 0),
"left" => (0, -amount),
"right" => (0, amount),
_ => (-amount, 0),
};
run_subprocess("swift", &["-e", &format!(
"import CoreGraphics; \
if let e = CGEvent(scrollWheelEvent2Source: nil, units: .line, \
wheelCount: 2, wheel1: Int32({scroll_y}), wheel2: Int32({scroll_x}), wheel3: 0) \
{{ e.post(tap: .cghidEventTap) }}"
)]).await?;
} else if is_windows {
if x != 0 || y != 0 {
win_set_cursor(x, y).await?;
}
let (wheel_flag, delta) = match direction {
"up" => ("0x0800", 120 * amount), "down" => ("0x0800", -120 * amount),
"left" => ("0x01000", 120 * amount), "right" => ("0x01000", -120 * amount),
_ => ("0x0800", -120 * amount),
};
run_powershell_input(&format!(
r#"Add-Type @"
using System;
using System.Runtime.InteropServices;
public class WinScroll {{
[DllImport("user32.dll")] static extern void mouse_event(uint f, uint dx, uint dy, int d, int e);
public static void Scroll(uint flag, int delta) {{
mouse_event(flag, 0, 0, delta, 0);
}}
}}
"@
[WinScroll]::Scroll({wheel_flag}, {delta})"#
)).await?;
} else {
if x != 0 || y != 0 {
run_subprocess("xdotool", &["mousemove", &x.to_string(), &y.to_string()]).await?;
}
let btn = match direction {
"up" => "4", "down" => "5", "left" => "6", "right" => "7", _ => "5",
};
run_subprocess("xdotool", &["click", "--repeat", &amount.to_string(), "--delay", "30", btn]).await?;
}
Ok(json!({"action": "scroll", "direction": direction, "amount": amount, "ok": true}))
}
"type" => {
let text = args["text"]
.as_str()
.ok_or_else(|| anyhow!("computer_use type: `text` required"))?;
if is_macos {
let escaped = text.replace('\\', "\\\\").replace('"', "\\\"");
run_subprocess(
"osascript",
&["-e", &format!("tell application \"System Events\" to keystroke \"{escaped}\"")],
).await?;
} else if is_windows {
let mut escaped = String::with_capacity(text.len() * 2);
for ch in text.chars() {
match ch {
'{' => escaped.push_str("{{}"),
'}' => escaped.push_str("{}}"),
'+' => escaped.push_str("{+}"),
'^' => escaped.push_str("{^}"),
'%' => escaped.push_str("{%}"),
'~' => escaped.push_str("{~}"),
'(' => escaped.push_str("{(}"),
')' => escaped.push_str("{)}"),
'\'' => escaped.push_str("''"),
other => escaped.push(other),
}
}
run_powershell_input(&format!(
"Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('{escaped}')"
)).await?;
} else {
run_subprocess("xdotool", &["type", "--clearmodifiers", text]).await?;
}
Ok(json!({"action": "type", "ok": true}))
}
"key" => {
let key = args["key"]
.as_str()
.ok_or_else(|| anyhow!("computer_use key: `key` required"))?;
if is_macos {
if key.contains('+') {
let parts: Vec<&str> = key.split('+').collect();
let mut cliclick_args: Vec<String> = Vec::new();
for &modifier in &parts[..parts.len() - 1] {
let m = map_modifier(modifier);
cliclick_args.push(format!("kd:{m}"));
}
let base = parts[parts.len() - 1];
if is_cliclick_special_key(base) {
cliclick_args.push(format!("kp:{base}"));
} else {
cliclick_args.push(format!("t:{base}"));
}
for &modifier in parts[..parts.len() - 1].iter().rev() {
let m = map_modifier(modifier);
cliclick_args.push(format!("ku:{m}"));
}
let refs: Vec<&str> = cliclick_args.iter().map(|s| s.as_str()).collect();
run_subprocess("cliclick", &refs).await?;
} else if is_cliclick_special_key(key) {
run_subprocess("cliclick", &[&format!("kp:{key}")]).await?;
} else {
let escaped = key.replace('\\', "\\\\").replace('"', "\\\"");
run_subprocess("osascript", &[
"-e", &format!("tell application \"System Events\" to keystroke \"{escaped}\""),
]).await?;
}
} else if is_windows {
let send_key = win_map_key(key);
let escaped = send_key.replace('\'', "''");
run_powershell_input(&format!(
"Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('{escaped}')"
)).await?;
} else {
run_subprocess("xdotool", &["key", key]).await?;
}
Ok(json!({"action": "key", "ok": true}))
}
"hold_key" => {
let key = args["key"]
.as_str()
.ok_or_else(|| anyhow!("computer_use hold_key: `key` required"))?;
let (x, y) = xy();
let sub_action = args["then"].as_str().unwrap_or("click");
if is_macos {
let m = map_modifier(key);
let click_cmd = match sub_action {
"double_click" => format!("dc:{x},{y}"),
"right_click" => format!("rc:{x},{y}"),
_ => format!("c:{x},{y}"),
};
run_subprocess("cliclick", &[&format!("kd:{m}"), &click_cmd, &format!("ku:{m}")]).await?;
} else if is_windows {
let key_lower = key.to_lowercase();
let vk = match key_lower.as_str() {
"ctrl" | "control" => "0x11",
"alt" => "0x12",
"shift" => "0x10",
"win" | "super" | "cmd" | "command" => "0x5B",
_ => "0x10", };
let clicks = match sub_action { "double_click" => 2, "triple_click" => 3, _ => 1 };
let btn_down = match sub_action { "right_click" => "0x0008", _ => "0x0002" };
let btn_up = match sub_action { "right_click" => "0x0010", _ => "0x0004" };
run_powershell_input(&format!(
r#"Add-Type @"
using System;
using System.Runtime.InteropServices;
public class WinHoldKey {{
[DllImport("user32.dll")] public static extern bool SetCursorPos(int x, int y);
[DllImport("user32.dll")] static extern void mouse_event(uint f, uint dx, uint dy, uint d, int e);
[DllImport("user32.dll")] static extern void keybd_event(byte vk, byte scan, uint flags, int extra);
public static void HoldAndClick(int x, int y, byte vk, uint down, uint up, int clicks) {{
keybd_event(vk, 0, 0, 0); // key down
SetCursorPos(x, y);
for (int i = 0; i < clicks; i++) {{
mouse_event(down, 0, 0, 0, 0);
mouse_event(up, 0, 0, 0, 0);
if (i < clicks - 1) System.Threading.Thread.Sleep(50);
}}
keybd_event(vk, 0, 2, 0); // key up (KEYEVENTF_KEYUP=2)
}}
}}
"@
[WinHoldKey]::HoldAndClick({x}, {y}, {vk}, {btn_down}, {btn_up}, {clicks})"#
)).await?;
} else {
let xdo_key = map_modifier_xdotool(key);
run_subprocess("xdotool", &["mousemove", &x.to_string(), &y.to_string()]).await?;
run_subprocess("xdotool", &["keydown", &xdo_key]).await?;
let repeat = match sub_action { "double_click" => "2", "triple_click" => "3", _ => "1" };
let btn = match sub_action { "right_click" => "3", _ => "1" };
run_subprocess("xdotool", &["click", "--repeat", repeat, "--delay", "50", btn]).await?;
run_subprocess("xdotool", &["keyup", &xdo_key]).await?;
}
Ok(json!({"action": "hold_key", "key": key, "then": sub_action, "ok": true}))
}
"cursor_position" => {
let pos = if is_macos {
let output = tokio::process::Command::new("cliclick")
.arg("p:.")
.output()
.await
.map_err(|e| anyhow!("cliclick: {e}"))?;
String::from_utf8_lossy(&output.stdout).trim().to_string()
} else if is_windows {
let output = powershell_hidden()
.args(["-Command",
"Add-Type -AssemblyName System.Windows.Forms; $p = [System.Windows.Forms.Cursor]::Position; \"$($p.X),$($p.Y)\""])
.output()
.await
.map_err(|e| anyhow!("powershell: {e}"))?;
String::from_utf8_lossy(&output.stdout).trim().to_string()
} else {
let output = tokio::process::Command::new("xdotool")
.args(["getmouselocation", "--shell"])
.output()
.await
.map_err(|e| anyhow!("xdotool: {e}"))?;
String::from_utf8_lossy(&output.stdout).trim().to_string()
};
let parts: Vec<&str> = pos.split(',').collect();
let (cx, cy) = if parts.len() >= 2 {
(
parts[0].trim().parse::<i64>().unwrap_or(0),
parts[1].trim().parse::<i64>().unwrap_or(0),
)
} else {
let mut cx = 0i64;
let mut cy = 0i64;
for line in pos.lines() {
if let Some(v) = line.strip_prefix("X=") {
cx = v.parse().unwrap_or(0);
} else if let Some(v) = line.strip_prefix("Y=") {
cy = v.parse().unwrap_or(0);
}
}
(cx, cy)
};
Ok(json!({"action": "cursor_position", "x": cx, "y": cy}))
}
"get_active_window" => {
let title = if is_macos {
let output = tokio::process::Command::new("osascript")
.args(["-e", "tell application \"System Events\" to get name of first process whose frontmost is true"])
.output()
.await
.map_err(|e| anyhow!("osascript: {e}"))?;
let app = String::from_utf8_lossy(&output.stdout).trim().to_string();
let output2 = tokio::process::Command::new("osascript")
.args(["-e", "tell application \"System Events\" to get name of front window of (first process whose frontmost is true)"])
.output()
.await;
let win = output2.map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string()).unwrap_or_default();
if win.is_empty() { app } else { format!("{app} — {win}") }
} else if is_windows {
let output = powershell_hidden()
.args(["-Command",
"Add-Type @\"\nusing System;\nusing System.Runtime.InteropServices;\npublic class WinTitle {\n [DllImport(\"user32.dll\")] static extern IntPtr GetForegroundWindow();\n [DllImport(\"user32.dll\")] static extern int GetWindowText(IntPtr h, System.Text.StringBuilder s, int n);\n public static string Get() {\n var sb = new System.Text.StringBuilder(256);\n GetWindowText(GetForegroundWindow(), sb, 256);\n return sb.ToString();\n }\n}\n\"@\n[WinTitle]::Get()"])
.output()
.await
.map_err(|e| anyhow!("powershell: {e}"))?;
String::from_utf8_lossy(&output.stdout).trim().to_string()
} else {
let output = tokio::process::Command::new("xdotool")
.args(["getactivewindow", "getwindowname"])
.output()
.await
.map_err(|e| anyhow!("xdotool: {e}"))?;
String::from_utf8_lossy(&output.stdout).trim().to_string()
};
Ok(json!({"action": "get_active_window", "title": title}))
}
"wait" => {
let ms = args["ms"].as_u64().unwrap_or(500).min(10000); tokio::time::sleep(std::time::Duration::from_millis(ms)).await;
Ok(json!({"action": "wait", "ms": ms, "ok": true}))
}
other => Err(anyhow!(
"computer_use: unsupported action `{other}` \
(supported: screenshot, mouse_move, mouse_click, double_click, triple_click, \
right_click, middle_click, drag, scroll, type, key, hold_key, cursor_position, \
get_active_window, wait)"
)),
}
}
}