Skip to main content

vmette_proto/
agent.rs

1//! The host↔guest **computer-use vocabulary** spoken over vsock between the
2//! host [`Session`](https://docs.rs/vmette) (Agent workload) and the in-guest
3//! `vmette-desktop-agent`.
4//!
5//! These are pure types: a request is an [`Action`] serialized as the JSON
6//! header of a frame (no payload), and a reply is a [`ResponseHeader`]
7//! optionally followed by a binary payload (e.g. a screenshot PNG). The
8//! framing codec that moves them over the wire lives in `vmette::desktop`.
9
10use serde::{Deserialize, Serialize};
11
12/// A single computer-use action sent host → guest. Serialized as the JSON
13/// header of a request frame (no payload). Variants mirror the Anthropic
14/// computer-use tool so the MCP layer maps 1:1.
15#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
16#[serde(tag = "action", rename_all = "snake_case")]
17pub enum Action {
18    /// Capture the framebuffer; response carries a PNG payload. The PNG's pixel
19    /// dimensions are the coordinate space every pointer action targets
20    /// (top-left origin), so a caller can map a downscaled rendering back to
21    /// true coordinates.
22    Screenshot,
23    /// Report the current pointer position in the response header (`x`,`y`).
24    CursorPosition,
25    /// Absolute pointer move to `(x, y)`. The response header echoes the
26    /// *resulting* pointer position (`x`,`y`) — a window manager can constrain
27    /// the pointer, so the echo is the ground truth of where it landed.
28    MouseMove { x: i32, y: i32 },
29    /// Left button click at the current pointer position. The response header
30    /// echoes the pointer position the click fired at (`x`,`y`).
31    LeftClick,
32    /// Right button click at the current pointer position. The response header
33    /// echoes the pointer position the click fired at (`x`,`y`).
34    RightClick,
35    /// Middle button click at the current pointer position. The response header
36    /// echoes the pointer position the click fired at (`x`,`y`).
37    MiddleClick,
38    /// Double left click at the current pointer position. The response header
39    /// echoes the pointer position the click fired at (`x`,`y`).
40    DoubleClick,
41    /// Press-move-release: drag from the current position to `(x, y)`. The
42    /// response header echoes the resulting pointer position (`x`,`y`).
43    LeftClickDrag { x: i32, y: i32 },
44    /// Type a UTF-8 string via synthetic key events.
45    Type { text: String },
46    /// Press a key chord, e.g. `"ctrl+c"`, `"Return"`, `"alt+Tab"`.
47    Key { keys: String },
48    /// Scroll `amount` clicks in `direction` at `(x, y)`. The response header
49    /// echoes the resulting pointer position (`x`,`y`).
50    Scroll {
51        x: i32,
52        y: i32,
53        direction: ScrollDirection,
54        amount: i32,
55    },
56    /// Sleep `ms` milliseconds guest-side (lets UI settle).
57    Wait { ms: u64 },
58    /// Launch a shell command in the desktop session (e.g. `"chromium &"`).
59    Exec { command: String },
60    /// Run `command` (via `/bin/sh -c`) to completion **synchronously**,
61    /// returning its combined stdout/stderr as the response **payload** (UTF-8)
62    /// and its exit status in the header's `exit_code` (`None` ⇒ it did not
63    /// exit cleanly — killed by the `timeout_ms` guard or a signal). Stdin is
64    /// `/dev/null`. The in-guest agent is single-threaded, so a long command
65    /// blocks every other action until it returns — intended for short,
66    /// terminating commands (read a file, run a probe), not GUI apps (use
67    /// [`Action::Exec`]/[`Action::Navigate`] for those). `timeout_ms` defaults
68    /// guest-side and is clamped below the host vsock read timeout.
69    ExecCapture {
70        command: String,
71        #[serde(default, skip_serializing_if = "Option::is_none")]
72        timeout_ms: Option<u64>,
73    },
74    /// Open `url` in the desktop's browser. The guest hands the URL to a
75    /// fixed launcher (`vmette-open`) **without a shell**, so the URL is never
76    /// word-split or interpreted — a deterministic, injection-safe alternative
77    /// to driving the address bar with synthetic keystrokes. Fire-and-forget:
78    /// returns a bare ok once the launcher is spawned, not when the page loads
79    /// (pair with a settle screenshot to wait for paint).
80    Navigate { url: String },
81    /// Replace the X clipboard (the `CLIPBOARD` and `PRIMARY` selections) with
82    /// `text`, so a subsequent paste (Ctrl+V in GUI apps, Shift+Insert /
83    /// middle-click in terminals) inserts it. Pairs with [`Action::Key`].
84    SetClipboard { text: String },
85    /// Read the X `CLIPBOARD` selection; the text is returned as the response
86    /// **payload** (UTF-8), not in the header — so arbitrary content needs no
87    /// JSON escaping. Empty when the clipboard is unset.
88    GetClipboard,
89}
90
91/// Scroll wheel direction for [`Action::Scroll`].
92#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
93#[serde(rename_all = "snake_case")]
94pub enum ScrollDirection {
95    Up,
96    Down,
97    Left,
98    Right,
99}
100
101/// JSON header of a response frame (guest → host). `ok` reports success;
102/// on failure `error` carries a message and no payload follows. `x`/`y`
103/// carry the pointer position: [`Action::CursorPosition`] reports it, and the
104/// pointer actions (move / clicks / drag / scroll) echo the *resulting*
105/// position so a caller can verify where the pointer actually landed.
106/// `exit_code` is populated by
107/// [`Action::ExecCapture`] (`None` ⇒ the command did not exit cleanly, e.g.
108/// it timed out). `payload_len` is the count of binary bytes (e.g. PNG)
109/// following this header in the frame.
110#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
111pub struct ResponseHeader {
112    pub ok: bool,
113    #[serde(default, skip_serializing_if = "Option::is_none")]
114    pub error: Option<String>,
115    #[serde(default, skip_serializing_if = "Option::is_none")]
116    pub x: Option<i32>,
117    #[serde(default, skip_serializing_if = "Option::is_none")]
118    pub y: Option<i32>,
119    #[serde(default, skip_serializing_if = "Option::is_none")]
120    pub exit_code: Option<i32>,
121    #[serde(default)]
122    pub payload_len: u32,
123}
124
125impl ResponseHeader {
126    /// A bare success header with no payload and no coordinates.
127    pub fn ok() -> Self {
128        Self {
129            ok: true,
130            error: None,
131            x: None,
132            y: None,
133            exit_code: None,
134            payload_len: 0,
135        }
136    }
137
138    /// A failure header carrying `msg`.
139    pub fn err(msg: impl Into<String>) -> Self {
140        Self {
141            ok: false,
142            error: Some(msg.into()),
143            x: None,
144            y: None,
145            exit_code: None,
146            payload_len: 0,
147        }
148    }
149}
150
/// Wire-format tests: they pin the exact JSON each protocol type produces and
/// accepts, since host and guest must agree on it byte-for-byte.
#[cfg(test)]
mod tests {
    use super::*;

    /// Serialize `value` to its JSON wire form, panicking on failure.
    fn wire<T: serde::Serialize>(value: &T) -> String {
        serde_json::to_string(value).unwrap()
    }

    /// Serialize `value`, parse the JSON back, and assert nothing changed.
    fn assert_round_trips<T>(value: &T)
    where
        T: serde::Serialize + serde::de::DeserializeOwned + PartialEq + std::fmt::Debug,
    {
        let json = wire(value);
        let back: T = serde_json::from_str(&json).unwrap();
        assert_eq!(&back, value, "round trip through {}", json);
    }

    #[test]
    fn action_screenshot_serializes_with_tag() {
        assert_eq!(wire(&Action::Screenshot), r#"{"action":"screenshot"}"#);
    }

    #[test]
    fn payloadless_actions_serialize_as_bare_tag() {
        // Field-less variants must carry nothing but the `action` tag.
        for (action, expected) in [
            (Action::CursorPosition, r#"{"action":"cursor_position"}"#),
            (Action::LeftClick, r#"{"action":"left_click"}"#),
            (Action::RightClick, r#"{"action":"right_click"}"#),
            (Action::MiddleClick, r#"{"action":"middle_click"}"#),
            (Action::DoubleClick, r#"{"action":"double_click"}"#),
        ] {
            assert_eq!(wire(&action), expected);
            assert_round_trips(&action);
        }
    }

    #[test]
    fn action_with_fields_round_trips() {
        let action = Action::MouseMove { x: 10, y: 20 };
        assert_eq!(wire(&action), r#"{"action":"mouse_move","x":10,"y":20}"#);
        assert_round_trips(&action);
    }

    #[test]
    fn left_click_drag_round_trips() {
        // The drag target's wire shape — the contract the CLI `drag` verb, the
        // MCP `desktop_drag` tool, and the guest agent's interpolated drag all
        // agree on. The agent reads `x`/`y` as the drag *end*; it starts at the
        // current pointer.
        let action = Action::LeftClickDrag { x: 640, y: 400 };
        assert_eq!(
            wire(&action),
            r#"{"action":"left_click_drag","x":640,"y":400}"#
        );
        assert_round_trips(&action);
    }

    #[test]
    fn scroll_direction_is_snake_case() {
        let action = Action::Scroll {
            x: 1,
            y: 2,
            direction: ScrollDirection::Down,
            amount: 3,
        };
        assert!(wire(&action).contains(r#""direction":"down""#));
        assert_round_trips(&action);
    }

    #[test]
    fn scroll_direction_variants_use_lowercase_wire_names() {
        for (direction, expected) in [
            (ScrollDirection::Up, r#""up""#),
            (ScrollDirection::Down, r#""down""#),
            (ScrollDirection::Left, r#""left""#),
            (ScrollDirection::Right, r#""right""#),
        ] {
            assert_eq!(wire(&direction), expected);
            assert_round_trips(&direction);
        }
    }

    #[test]
    fn clipboard_actions_serialize_snake_case() {
        assert_eq!(wire(&Action::GetClipboard), r#"{"action":"get_clipboard"}"#);
        let set = Action::SetClipboard { text: "hi".into() };
        assert_eq!(wire(&set), r#"{"action":"set_clipboard","text":"hi"}"#);
    }

    #[test]
    fn type_and_key_round_trip() {
        // Despite the name this sweeps every string-, timeout- and
        // duration-carrying action; the name is kept so existing test filters
        // keep matching.
        for action in [
            Action::Type {
                text: "hello world".into(),
            },
            Action::Key {
                keys: "ctrl+c".into(),
            },
            Action::Exec {
                command: "chromium &".into(),
            },
            Action::Navigate {
                url: "https://example.com/a?b=c&d=e".into(),
            },
            Action::ExecCapture {
                command: "cat /etc/os-release".into(),
                timeout_ms: Some(5000),
            },
            Action::ExecCapture {
                command: "ls".into(),
                timeout_ms: None,
            },
            Action::SetClipboard {
                text: "clip".into(),
            },
            Action::GetClipboard,
            Action::Wait { ms: 500 },
        ] {
            assert_round_trips(&action);
        }
    }

    #[test]
    fn unknown_or_missing_action_tag_is_rejected() {
        assert!(serde_json::from_str::<Action>(r#"{"action":"teleport"}"#).is_err());
        assert!(serde_json::from_str::<Action>(r#"{}"#).is_err());
    }

    #[test]
    fn response_header_ok_omits_optional_fields() {
        assert_eq!(wire(&ResponseHeader::ok()), r#"{"ok":true,"payload_len":0}"#);
    }

    #[test]
    fn response_header_defaults_missing_fields() {
        // A peer may send only `ok`; everything else must fall back to its
        // serde default instead of failing to parse.
        let parsed: ResponseHeader = serde_json::from_str(r#"{"ok":true}"#).unwrap();
        assert_eq!(parsed, ResponseHeader::ok());
    }

    #[test]
    fn exec_capture_serializes_timeout_when_set() {
        let without = Action::ExecCapture {
            command: "ls".into(),
            timeout_ms: None,
        };
        assert_eq!(wire(&without), r#"{"action":"exec_capture","command":"ls"}"#);

        let with = Action::ExecCapture {
            command: "ls".into(),
            timeout_ms: Some(2000),
        };
        assert_eq!(
            wire(&with),
            r#"{"action":"exec_capture","command":"ls","timeout_ms":2000}"#
        );
    }

    #[test]
    fn exec_capture_timeout_defaults_to_none_when_absent() {
        let parsed: Action =
            serde_json::from_str(r#"{"action":"exec_capture","command":"ls"}"#).unwrap();
        assert_eq!(
            parsed,
            Action::ExecCapture {
                command: "ls".into(),
                timeout_ms: None,
            }
        );
    }

    #[test]
    fn response_header_carries_exit_code() {
        let header = ResponseHeader {
            ok: true,
            error: None,
            x: None,
            y: None,
            exit_code: Some(0),
            payload_len: 12,
        };
        assert!(wire(&header).contains(r#""exit_code":0"#));
        assert_round_trips(&header);
    }

    #[test]
    fn response_header_err_carries_message() {
        let header = ResponseHeader::err("boom");
        let json = wire(&header);
        assert!(json.contains(r#""ok":false"#));
        assert!(json.contains(r#""error":"boom""#));
        assert_round_trips(&header);
    }

    #[test]
    fn cursor_position_response_carries_coords() {
        let header = ResponseHeader {
            ok: true,
            error: None,
            x: Some(640),
            y: Some(400),
            exit_code: None,
            payload_len: 0,
        };
        let json = wire(&header);
        // Pin the wire shape too, not just the parsed values.
        assert_eq!(json, r#"{"ok":true,"x":640,"y":400,"payload_len":0}"#);
        let back: ResponseHeader = serde_json::from_str(&json).unwrap();
        assert_eq!(back.x, Some(640));
        assert_eq!(back.y, Some(400));
    }
}