mur_common/mobile.rs
//! Wire protocol for the MUR mobile app ↔ Mac daemon WebSocket endpoint.
//!
//! Shared by `mur-mobile-sdk` (the phone client) and `mur-daemon` (the Mac
//! endpoint) so both ends agree on the framing. Every message is one JSON
//! object sent as a WebSocket *text* frame. The first client frame opens a
//! handshake: a legacy [`ClientFrame::Hello`], a proof-based
//! [`ClientFrame::HelloInit`] (proto ≥ 2), or a key-based
//! [`ClientFrame::Resume`]. Once the server replies [`ServerFrame::Paired`],
//! application traffic is carried as Ed25519-signed [`SignedEnvelope`]s whose
//! `payload` is the canonical JSON of an A2A `JsonRpcRequest` — the same crypto
//! MUR uses for agent↔agent bridge traffic.
//!
//! P3 adds a voice streaming path: the phone streams raw 16 kHz mono f32 PCM
//! chunks, the Mac runs whisper.cpp for an authoritative transcript, then
//! replies with Kokoro TTS audio chunks.
//!
//! Design: `docs/superpowers/specs/2026-06-05-mur-voice-mobile-app-design.md`.
16
17use crate::bridge::envelope::SignedEnvelope;
18use serde::{Deserialize, Serialize};
19
/// URL path on which the daemon serves its mobile WebSocket endpoint.
pub const MOBILE_WS_PATH: &str = "/api/v1/mobile/ws";

/// Name of the A2A method with which the phone gives its authoritative answer
/// to a HITL gate (v4c). The call travels inside a signed
/// [`ClientFrame::Envelope`] rather than as a plain frame, so the daemon checks
/// the phone's Ed25519 signature before it writes the gate-releasing
/// `HitlResponse`.
pub const HITL_RESPOND_METHOD: &str = "channel/hitl_respond";
28
29/// Frames the phone sends to the Mac endpoint.
30#[derive(Debug, Serialize, Deserialize)]
31#[serde(tag = "type", rename_all = "snake_case")]
32pub enum ClientFrame {
33 /// Pairing / auth handshake. `token` is the one-time token from the QR
34 /// code; `pubkey` is the phone's multibase Ed25519 public key, which the
35 /// Mac records as a paired device on success. `agent` is the canonical
36 /// agent name the phone wants to talk to (e.g. `"mur"`).
37 Hello {
38 pubkey: String,
39 token: String,
40 agent: String,
41 },
42 /// Enrollment step 1 (proto ≥ 2): the phone announces intent WITHOUT sending
43 /// the token. `wid` is the pairing-window id from the QR; `pubkey` is the
44 /// phone's Ed25519 key to enroll. The daemon replies [`ServerFrame::PairChallenge`].
45 HelloInit {
46 proto: u32,
47 agent: String,
48 pubkey: String,
49 wid: String,
50 },
51 /// Enrollment step 3: proof of token possession. `proof` is
52 /// HMAC-SHA256(token, transcript) — the token itself is never transmitted.
53 HelloProof { wid: String, proof: Vec<u8> },
54 /// Reconnect by paired KEY, no enrollment token (the steady-state auth). The
55 /// phone announces its `pubkey`; the daemon replies [`ServerFrame::Challenge`]
56 /// with a fresh per-connection nonce, which the phone signs back in a
57 /// [`ClientFrame::ResumeProof`]. Distinct from `Hello` so single-use
58 /// enrollment windows never gate reconnects.
59 Resume { pubkey: String, agent: String },
60 /// Proof for a `Resume`: a [`SignedEnvelope`] whose `payload` is exactly the
61 /// challenge nonce bytes, signed by the device key. The daemon-issued nonce
62 /// makes a captured proof non-replayable on a new connection.
63 ResumeProof { envelope: SignedEnvelope },
64 /// A signed A2A request destined for the agent (text-only path).
65 Envelope { envelope: SignedEnvelope },
66 /// Phone begins a voice utterance. The Mac clears its audio accumulator and
67 /// prepares for incoming chunks at `sample_rate` Hz, mono f32.
68 AudioStreamStart { sample_rate: u32 },
69 /// One chunk of raw PCM (f32 LE, `sample_rate` Hz mono, standard base64).
70 /// Authenticated by the connection (paired at `Hello`); no per-chunk sig.
71 AudioChunk { data: String },
72 /// Phone finished speaking. Mac should run STT on the accumulated audio,
73 /// forward to the agent, and stream TTS audio back.
74 AudioStreamEnd,
75 /// Pull channel data. `op` ∈ "list" | "events". For "events", `channel_id`
76 /// is required and `since_seq` (inclusive) enables catch-up. Authenticated
77 /// by the paired connection (like the audio frames).
78 ChannelQuery {
79 op: String,
80 #[serde(default)]
81 channel_id: Option<String>,
82 #[serde(default)]
83 since_seq: Option<u64>,
84 },
85}
86
87/// Frames the Mac endpoint sends back to the phone.
88#[derive(Debug, Serialize, Deserialize)]
89#[serde(tag = "type", rename_all = "snake_case")]
90pub enum ServerFrame {
91 /// Handshake accepted; the phone is now paired with `agent`. For a proof
92 /// enrollment (proto ≥ 2), `confirm` is HMAC-SHA256(token, daemon→phone
93 /// transcript) — the phone verifies it to authenticate the daemon (closes
94 /// rogue-daemon/relay MITM). Empty for legacy Hello / Resume paths.
95 Paired {
96 agent: String,
97 #[serde(default)]
98 confirm: Vec<u8>,
99 },
100 /// Enrollment step 2 (proto ≥ 2): challenge for a [`ClientFrame::HelloInit`].
101 /// `nonce` is a fresh 32-byte liveness value; `did` is the daemon agent's
102 /// Ed25519 identity (multibase), which the phone cross-checks against the QR
103 /// to bind the endpoint on the TLS-less LAN.
104 PairChallenge {
105 wid: String,
106 nonce: Vec<u8>,
107 did: String,
108 },
109 /// The handshake or a later frame was rejected.
110 Rejected { reason: String },
111 /// Challenge issued in reply to a [`ClientFrame::Resume`]: a fresh
112 /// per-connection nonce the phone must sign (in a `ResumeProof`) to prove it
113 /// holds the paired device key.
114 Challenge { nonce: String },
115 /// An asynchronous event mirrored to the phone. `name` is dot-namespaced
116 /// (`mobile.transcript`, `mobile.reply`, …) to match the Hub `EventBus`
117 /// names used for desktop mirroring.
118 Event {
119 name: String,
120 payload: serde_json::Value,
121 },
122 /// Mac whisper.cpp authoritative transcript for the user's just-spoken
123 /// utterance. Overrides the on-device SFSpeech partial. `is_final: true`
124 /// means this is the definitive text for this turn.
125 Transcript { text: String, is_final: bool },
126 /// A chunk of Kokoro TTS audio (f32 LE PCM, 24 kHz mono). The phone
127 /// accumulates chunks until `done: true`, then plays them back.
128 /// `base64` is the standard base64 encoding of the raw bytes.
129 AudioChunk {
130 base64: String,
131 sample_rate: u32,
132 done: bool,
133 },
134 /// Response to a `ChannelQuery`. `op` echoes the request; `payload` is a
135 /// JSON array (channel summaries for "list", events for "events").
136 ChannelData {
137 op: String,
138 payload: serde_json::Value,
139 },
140}
141
// ── Enrollment proof (proto ≥ 2) ─────────────────────────────────────────────
//
// The phone proves possession of the single-use pairing token WITHOUT
// transmitting it: HMAC-SHA256(token, transcript). The token is the HMAC key;
// it is never part of the signed bytes and never on the wire, which defeats
// on-path sniffing of the TLS-less LAN WebSocket. All primitives live here (the
// only crate with the crypto deps); the daemon and SDK call these free functions.
149
/// Domain tag that opens every enrollment transcript (version-bound for
/// anti-downgrade).
const PAIR_DOMAIN: &[u8] = b"mur-pair-v2";
/// Role tag for the phone→daemon proof. The two directions carry different
/// tags so a daemon→phone confirm can't be replayed as a phone→daemon proof
/// (reflection defense).
pub const PAIR_ROLE_PHONE_TO_DAEMON: &[u8] = b"mur-pair-v2/phone->daemon";
/// Role tag for the daemon→phone confirm; see [`PAIR_ROLE_PHONE_TO_DAEMON`].
pub const PAIR_ROLE_DAEMON_TO_PHONE: &[u8] = b"mur-pair-v2/daemon->phone";

/// Build the enrollment transcript, bound to every field that scopes the proof.
///
/// Byte layout, in order: `PAIR_DOMAIN`, `role`, `proto`, `agent`, `wid`,
/// `did`, `phone_pubkey`, `nonce`. Each variable-length field is preceded by
/// its byte length as a FIXED-width `u32` little-endian prefix, so no field
/// boundary is ambiguous (mirrors `bridge::envelope::signing_bytes`). `proto`
/// is itself fixed-width (4 bytes, little-endian) and therefore carries no
/// prefix.
///
/// The token is NOT included — it is the HMAC key in [`pair_proof`], never part
/// of the authenticated bytes.
///
/// # Panics
///
/// Panics if any field is longer than `u32::MAX` bytes. Such a length cannot be
/// represented in the prefix; truncating it silently would reintroduce exactly
/// the boundary ambiguity the prefix exists to rule out.
pub fn pair_transcript(
    role: &[u8],
    proto: u32,
    agent: &str,
    wid: &str,
    did: &str,
    phone_pubkey: &str,
    nonce: &[u8],
) -> Vec<u8> {
    /// Width of one length prefix, and of the fixed-width `proto` field.
    const U32_LEN: usize = std::mem::size_of::<u32>();

    /// Append `field` preceded by its checked `u32` little-endian byte length.
    fn put(out: &mut Vec<u8>, field: &[u8]) {
        let len = u32::try_from(field.len())
            .expect("pair transcript field is longer than u32::MAX bytes");
        out.extend_from_slice(&len.to_le_bytes());
        out.extend_from_slice(field);
    }

    // Fields written before and after the un-prefixed `proto` word.
    let head: [&[u8]; 2] = [PAIR_DOMAIN, role];
    let tail: [&[u8]; 5] = [
        agent.as_bytes(),
        wid.as_bytes(),
        did.as_bytes(),
        phone_pubkey.as_bytes(),
        nonce,
    ];

    // The final size is known up front, so allocate exactly once.
    let prefixed_len: usize = head
        .iter()
        .chain(&tail)
        .map(|field| U32_LEN + field.len())
        .sum();
    let mut out = Vec::with_capacity(prefixed_len + U32_LEN);

    for field in &head {
        put(&mut out, field);
    }
    out.extend_from_slice(&proto.to_le_bytes());
    for field in &tail {
        put(&mut out, field);
    }
    out
}
185
186/// HMAC-SHA256(token, transcript) — full 256-bit output, never truncated.
187pub fn pair_proof(token: &[u8], transcript: &[u8]) -> [u8; 32] {
188 use hmac::{Hmac, Mac};
189 let mut mac = <Hmac<sha2::Sha256> as Mac>::new_from_slice(token)
190 .expect("HMAC accepts a key of any length");
191 mac.update(transcript);
192 mac.finalize().into_bytes().into()
193}
194
195/// Constant-time byte-slice equality (length is not secret; contents compared
196/// in constant time so a partial match doesn't leak via timing).
197pub fn ct_verify(a: &[u8], b: &[u8]) -> bool {
198 use subtle::ConstantTimeEq;
199 a.len() == b.len() && a.ct_eq(b).into()
200}
201
202/// A fresh 32-byte liveness nonce for the enrollment challenge (OS CSPRNG).
203pub fn mint_nonce() -> [u8; 32] {
204 use rand_core::RngCore;
205 let mut n = [0u8; 32];
206 rand_core::OsRng.fill_bytes(&mut n);
207 n
208}
209
#[cfg(test)]
mod tests {
    use super::*;

    /// Both channel frames serialize with their `snake_case` type tag and
    /// deserialize back to the same variant with the same field values.
    #[test]
    fn channel_query_and_data_frames_round_trip() {
        let q = ClientFrame::ChannelQuery {
            op: "events".into(),
            channel_id: Some("c1".into()),
            since_seq: Some(3),
        };
        let s = serde_json::to_string(&q).unwrap();
        assert!(s.contains("\"type\":\"channel_query\""));
        let back: ClientFrame = serde_json::from_str(&s).unwrap();
        // A bare `matches!(..)` only yields a bool; the decoded frame has to be
        // asserted on explicitly for this test to be able to fail.
        match back {
            ClientFrame::ChannelQuery {
                op,
                channel_id,
                since_seq,
            } => {
                assert_eq!(op, "events");
                assert_eq!(channel_id.as_deref(), Some("c1"));
                assert_eq!(since_seq, Some(3));
            }
            other => panic!("expected ChannelQuery, got {other:?}"),
        }

        let d = ServerFrame::ChannelData {
            op: "list".into(),
            payload: serde_json::json!([]),
        };
        let s2 = serde_json::to_string(&d).unwrap();
        assert!(s2.contains("\"type\":\"channel_data\""));
        let back2: ServerFrame = serde_json::from_str(&s2).unwrap();
        match back2 {
            ServerFrame::ChannelData { op, payload } => {
                assert_eq!(op, "list");
                assert_eq!(payload, serde_json::json!([]));
            }
            other => panic!("expected ChannelData, got {other:?}"),
        }
    }

    /// The proof is deterministic for a given token and transcript, and a
    /// different token yields a different proof.
    #[test]
    fn pair_proof_round_trips_and_rejects_wrong_token() {
        let token = b"the-122-bit-token";
        let t = pair_transcript(
            PAIR_ROLE_PHONE_TO_DAEMON,
            2,
            "mur",
            "wid-1",
            "did-1",
            "zPHONE",
            &[7u8; 32],
        );
        let proof = pair_proof(token, &t);
        // Same token + transcript verifies.
        assert!(ct_verify(&proof, &pair_proof(token, &t)));
        // A different token produces a different proof.
        assert!(!ct_verify(&proof, &pair_proof(b"wrong-token", &t)));
    }

    /// Length prefixes keep field boundaries unambiguous, and the role tag
    /// separates the two directions.
    #[test]
    fn transcript_is_unambiguous_and_direction_tagged() {
        // Field boundaries are length-prefixed: moving a byte across a boundary
        // changes the transcript (no concatenation ambiguity).
        let a = pair_transcript(
            PAIR_ROLE_PHONE_TO_DAEMON,
            2,
            "mur",
            "w",
            "d",
            "p",
            &[0u8; 32],
        );
        let b = pair_transcript(
            PAIR_ROLE_PHONE_TO_DAEMON,
            2,
            "mu",
            "rw",
            "d",
            "p",
            &[0u8; 32],
        );
        assert_ne!(a, b, "length-prefixing prevents field-boundary collisions");

        // The two directions produce different transcripts under the same fields,
        // so a daemon→phone confirm can't be replayed as a phone→daemon proof.
        let phone = pair_transcript(
            PAIR_ROLE_PHONE_TO_DAEMON,
            2,
            "mur",
            "w",
            "d",
            "p",
            &[1u8; 32],
        );
        let daemon = pair_transcript(
            PAIR_ROLE_DAEMON_TO_PHONE,
            2,
            "mur",
            "w",
            "d",
            "p",
            &[1u8; 32],
        );
        assert_ne!(phone, daemon, "per-direction role tags differ");
        let tok = b"k";
        assert!(!ct_verify(
            &pair_proof(tok, &phone),
            &pair_proof(tok, &daemon)
        ));
    }

    /// Two consecutively minted nonces differ.
    #[test]
    fn mint_nonce_is_fresh() {
        assert_ne!(mint_nonce(), mint_nonce(), "nonces must not repeat");
    }

    /// Neither proof-enrollment frame sent by the phone contains the token.
    #[test]
    fn enrollment_frames_never_carry_the_token() {
        // The proof is KEYED by the token, but the token itself must never appear
        // in any frame the phone sends — that is the whole point of the handshake.
        let token = "SECRET-TOKEN-do-not-leak-0000";
        let t = pair_transcript(
            PAIR_ROLE_PHONE_TO_DAEMON,
            2,
            "mur",
            "wid",
            "did",
            "zPHONE",
            &[0u8; 32],
        );
        let proof = pair_proof(token.as_bytes(), &t);
        let frames = [
            serde_json::to_string(&ClientFrame::HelloInit {
                proto: 2,
                agent: "mur".into(),
                pubkey: "zPHONE".into(),
                wid: "wid".into(),
            })
            .unwrap(),
            serde_json::to_string(&ClientFrame::HelloProof {
                wid: "wid".into(),
                proof: proof.to_vec(),
            })
            .unwrap(),
        ];
        for f in frames {
            assert!(!f.contains(token), "token leaked into a wire frame: {f}");
        }
    }
}
343}