vmette_proto/agent.rs
1//! The host↔guest **computer-use vocabulary** spoken over vsock between the
2//! host [`Session`](https://docs.rs/vmette) (Agent workload) and the in-guest
3//! `vmette-desktop-agent`.
4//!
5//! These are pure types: a request is an [`Action`] serialized as the JSON
6//! header of a frame (no payload), and a reply is a [`ResponseHeader`]
7//! optionally followed by a binary payload (e.g. a screenshot PNG). The
8//! framing codec that moves them over the wire lives in `vmette::desktop`.
9
10use serde::{Deserialize, Serialize};
11
12/// A single computer-use action sent host → guest. Serialized as the JSON
13/// header of a request frame (no payload). Variants mirror the Anthropic
14/// computer-use tool so the MCP layer maps 1:1.
15#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
16#[serde(tag = "action", rename_all = "snake_case")]
17pub enum Action {
18 /// Capture the framebuffer; response carries a PNG payload. The PNG's pixel
19 /// dimensions are the coordinate space every pointer action targets
20 /// (top-left origin), so a caller can map a downscaled rendering back to
21 /// true coordinates.
22 Screenshot,
23 /// Report the current pointer position in the response header (`x`,`y`).
24 CursorPosition,
25 /// Absolute pointer move to `(x, y)`. The response header echoes the
26 /// *resulting* pointer position (`x`,`y`) — a window manager can constrain
27 /// the pointer, so the echo is the ground truth of where it landed.
28 MouseMove { x: i32, y: i32 },
29 /// Left button click at the current pointer position. The response header
30 /// echoes the pointer position the click fired at (`x`,`y`).
31 LeftClick,
32 /// Right button click at the current pointer position. The response header
33 /// echoes the pointer position the click fired at (`x`,`y`).
34 RightClick,
35 /// Middle button click at the current pointer position. The response header
36 /// echoes the pointer position the click fired at (`x`,`y`).
37 MiddleClick,
38 /// Double left click at the current pointer position. The response header
39 /// echoes the pointer position the click fired at (`x`,`y`).
40 DoubleClick,
41 /// Press-move-release: drag from the current position to `(x, y)`. The
42 /// response header echoes the resulting pointer position (`x`,`y`).
43 LeftClickDrag { x: i32, y: i32 },
44 /// Type a UTF-8 string via synthetic key events.
45 Type { text: String },
46 /// Press a key chord, e.g. `"ctrl+c"`, `"Return"`, `"alt+Tab"`.
47 Key { keys: String },
48 /// Scroll `amount` clicks in `direction` at `(x, y)`. The response header
49 /// echoes the resulting pointer position (`x`,`y`).
50 Scroll {
51 x: i32,
52 y: i32,
53 direction: ScrollDirection,
54 amount: i32,
55 },
56 /// Sleep `ms` milliseconds guest-side (lets UI settle).
57 Wait { ms: u64 },
58 /// Launch a shell command in the desktop session (e.g. `"chromium &"`).
59 Exec { command: String },
60 /// Run `command` (via `/bin/sh -c`) to completion **synchronously**,
61 /// returning its combined stdout/stderr as the response **payload** (UTF-8)
62 /// and its exit status in the header's `exit_code` (`None` ⇒ it did not
63 /// exit cleanly — killed by the `timeout_ms` guard or a signal). Stdin is
64 /// `/dev/null`. The in-guest agent is single-threaded, so a long command
65 /// blocks every other action until it returns — intended for short,
66 /// terminating commands (read a file, run a probe), not GUI apps (use
67 /// [`Action::Exec`]/[`Action::Navigate`] for those). `timeout_ms` defaults
68 /// guest-side and is clamped below the host vsock read timeout.
69 ExecCapture {
70 command: String,
71 #[serde(default, skip_serializing_if = "Option::is_none")]
72 timeout_ms: Option<u64>,
73 },
74 /// Open `url` in the desktop's browser. The guest hands the URL to a
75 /// fixed launcher (`vmette-open`) **without a shell**, so the URL is never
76 /// word-split or interpreted — a deterministic, injection-safe alternative
77 /// to driving the address bar with synthetic keystrokes. Fire-and-forget:
78 /// returns a bare ok once the launcher is spawned, not when the page loads
79 /// (pair with a settle screenshot to wait for paint).
80 Navigate { url: String },
81 /// Replace the X clipboard (the `CLIPBOARD` and `PRIMARY` selections) with
82 /// `text`, so a subsequent paste (Ctrl+V in GUI apps, Shift+Insert /
83 /// middle-click in terminals) inserts it. Pairs with [`Action::Key`].
84 SetClipboard { text: String },
85 /// Read the X `CLIPBOARD` selection; the text is returned as the response
86 /// **payload** (UTF-8), not in the header — so arbitrary content needs no
87 /// JSON escaping. Empty when the clipboard is unset.
88 GetClipboard,
89}
90
91/// Scroll wheel direction for [`Action::Scroll`].
92#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
93#[serde(rename_all = "snake_case")]
94pub enum ScrollDirection {
95 Up,
96 Down,
97 Left,
98 Right,
99}
100
101/// JSON header of a response frame (guest → host). `ok` reports success;
102/// on failure `error` carries a message and no payload follows. `x`/`y`
103/// carry the pointer position: [`Action::CursorPosition`] reports it, and the
104/// pointer actions (move / clicks / drag / scroll) echo the *resulting*
105/// position so a caller can verify where the pointer actually landed.
106/// `exit_code` is populated by
107/// [`Action::ExecCapture`] (`None` ⇒ the command did not exit cleanly, e.g.
108/// it timed out). `payload_len` is the count of binary bytes (e.g. PNG)
109/// following this header in the frame.
110#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
111pub struct ResponseHeader {
112 pub ok: bool,
113 #[serde(default, skip_serializing_if = "Option::is_none")]
114 pub error: Option<String>,
115 #[serde(default, skip_serializing_if = "Option::is_none")]
116 pub x: Option<i32>,
117 #[serde(default, skip_serializing_if = "Option::is_none")]
118 pub y: Option<i32>,
119 #[serde(default, skip_serializing_if = "Option::is_none")]
120 pub exit_code: Option<i32>,
121 #[serde(default)]
122 pub payload_len: u32,
123}
124
125impl ResponseHeader {
126 /// A bare success header with no payload and no coordinates.
127 pub fn ok() -> Self {
128 Self {
129 ok: true,
130 error: None,
131 x: None,
132 y: None,
133 exit_code: None,
134 payload_len: 0,
135 }
136 }
137
138 /// A failure header carrying `msg`.
139 pub fn err(msg: impl Into<String>) -> Self {
140 Self {
141 ok: false,
142 error: Some(msg.into()),
143 x: None,
144 y: None,
145 exit_code: None,
146 payload_len: 0,
147 }
148 }
149}
150
#[cfg(test)]
mod tests {
    use super::*;
    use serde::{de::DeserializeOwned, Serialize};
    use std::fmt::Debug;

    /// Serializes `value` to JSON, parses that JSON back, asserts the decoded
    /// value equals the input, and returns the JSON so the caller can also
    /// pin the wire shape.
    fn round_trip<T>(value: &T) -> String
    where
        T: Serialize + DeserializeOwned + PartialEq + Debug,
    {
        let json = serde_json::to_string(value).unwrap();
        let decoded: T = serde_json::from_str(&json).unwrap();
        assert_eq!(&decoded, value);
        json
    }

    #[test]
    fn action_screenshot_serializes_with_tag() {
        assert_eq!(
            serde_json::to_string(&Action::Screenshot).unwrap(),
            r#"{"action":"screenshot"}"#
        );
    }

    #[test]
    fn action_with_fields_round_trips() {
        let json = round_trip(&Action::MouseMove { x: 10, y: 20 });
        assert_eq!(json, r#"{"action":"mouse_move","x":10,"y":20}"#);
    }

    #[test]
    fn left_click_drag_round_trips() {
        // The drag target's wire shape — the contract the CLI `drag` verb, the
        // MCP `desktop_drag` tool, and the guest agent's interpolated drag all
        // agree on. The agent reads `x`/`y` as the drag *end*; it starts at the
        // current pointer.
        let json = round_trip(&Action::LeftClickDrag { x: 640, y: 400 });
        assert_eq!(json, r#"{"action":"left_click_drag","x":640,"y":400}"#);
    }

    #[test]
    fn scroll_direction_is_snake_case() {
        let json = round_trip(&Action::Scroll {
            x: 1,
            y: 2,
            direction: ScrollDirection::Down,
            amount: 3,
        });
        assert!(json.contains(r#""direction":"down""#));
    }

    #[test]
    fn clipboard_actions_serialize_snake_case() {
        assert_eq!(
            serde_json::to_string(&Action::GetClipboard).unwrap(),
            r#"{"action":"get_clipboard"}"#
        );
        let set = Action::SetClipboard { text: "hi".into() };
        assert_eq!(
            serde_json::to_string(&set).unwrap(),
            r#"{"action":"set_clipboard","text":"hi"}"#
        );
    }

    #[test]
    fn type_and_key_round_trip() {
        // Despite the name, this also sweeps the other payload-carrying
        // variants (exec, navigate, clipboard, wait).
        let actions = [
            Action::Type {
                text: "hello world".into(),
            },
            Action::Key {
                keys: "ctrl+c".into(),
            },
            Action::Exec {
                command: "chromium &".into(),
            },
            Action::Navigate {
                url: "https://example.com/a?b=c&d=e".into(),
            },
            Action::ExecCapture {
                command: "cat /etc/os-release".into(),
                timeout_ms: Some(5000),
            },
            Action::ExecCapture {
                command: "ls".into(),
                timeout_ms: None,
            },
            Action::SetClipboard {
                text: "clip".into(),
            },
            Action::GetClipboard,
            Action::Wait { ms: 500 },
        ];
        for action in &actions {
            round_trip(action);
        }
    }

    #[test]
    fn unit_actions_round_trip() {
        // The field-less variants must survive the internally tagged
        // representation too.
        let actions = [
            Action::Screenshot,
            Action::CursorPosition,
            Action::LeftClick,
            Action::RightClick,
            Action::MiddleClick,
            Action::DoubleClick,
        ];
        for action in &actions {
            round_trip(action);
        }
    }

    #[test]
    fn unknown_action_is_rejected() {
        assert!(serde_json::from_str::<Action>(r#"{"action":"teleport"}"#).is_err());
    }

    #[test]
    fn response_header_ok_omits_optional_fields() {
        let json = serde_json::to_string(&ResponseHeader::ok()).unwrap();
        assert_eq!(json, r#"{"ok":true,"payload_len":0}"#);
    }

    #[test]
    fn response_header_defaults_missing_fields() {
        // Only `ok` is mandatory on input; everything else falls back to its
        // default, which makes the result identical to the bare ok header.
        let decoded: ResponseHeader = serde_json::from_str(r#"{"ok":true}"#).unwrap();
        assert_eq!(decoded, ResponseHeader::ok());
    }

    #[test]
    fn exec_capture_serializes_timeout_when_set() {
        let without = Action::ExecCapture {
            command: "ls".into(),
            timeout_ms: None,
        };
        let without_json = r#"{"action":"exec_capture","command":"ls"}"#;
        assert_eq!(serde_json::to_string(&without).unwrap(), without_json);
        // A request that omits `timeout_ms` decodes to `None`.
        assert_eq!(serde_json::from_str::<Action>(without_json).unwrap(), without);

        let with = Action::ExecCapture {
            command: "ls".into(),
            timeout_ms: Some(2000),
        };
        assert_eq!(
            serde_json::to_string(&with).unwrap(),
            r#"{"action":"exec_capture","command":"ls","timeout_ms":2000}"#
        );
    }

    #[test]
    fn response_header_carries_exit_code() {
        let header = ResponseHeader {
            exit_code: Some(0),
            payload_len: 12,
            ..ResponseHeader::ok()
        };
        let json = round_trip(&header);
        assert!(json.contains(r#""exit_code":0"#));
    }

    #[test]
    fn response_header_err_carries_message() {
        let json = round_trip(&ResponseHeader::err("boom"));
        assert!(json.contains(r#""ok":false"#));
        assert!(json.contains(r#""error":"boom""#));
    }

    #[test]
    fn cursor_position_response_carries_coords() {
        let header = ResponseHeader {
            x: Some(640),
            y: Some(400),
            ..ResponseHeader::ok()
        };
        let json = serde_json::to_string(&header).unwrap();
        let decoded: ResponseHeader = serde_json::from_str(&json).unwrap();
        assert_eq!(decoded.x, Some(640));
        assert_eq!(decoded.y, Some(400));
        assert_eq!(decoded, header);
    }
}