Skip to main content

mur_common/
mobile.rs

//! Wire protocol for the MUR mobile app ↔ Mac daemon WebSocket endpoint.
//!
//! Shared by `mur-mobile-sdk` (the phone client) and `mur-daemon` (the Mac
//! endpoint) so both ends agree on the framing. Every message is one JSON
//! object sent as a WebSocket *text* frame. The first client frame is a
//! [`ClientFrame::Hello`] pairing handshake; once the server replies
//! [`ServerFrame::Paired`], application traffic is carried as Ed25519-signed
//! [`SignedEnvelope`]s whose `payload` is the canonical JSON of an A2A
//! `JsonRpcRequest` — the same crypto MUR uses for agent↔agent bridge traffic.
//!
//! P3 adds a voice streaming path: the phone streams raw 16 kHz mono f32 PCM
//! chunks, the Mac runs whisper.cpp for an authoritative transcript, then
//! replies with Kokoro TTS audio chunks.
//!
//! Design: `docs/superpowers/specs/2026-06-05-mur-voice-mobile-app-design.md`.
16
17use crate::bridge::envelope::SignedEnvelope;
18use serde::{Deserialize, Serialize};
19
/// URL path on which the daemon's mobile endpoint serves its WebSocket.
pub const MOBILE_WS_PATH: &str = "/api/v1/mobile/ws";
22
/// A2A method name the phone uses to answer a HITL gate authoritatively (v4c).
///
/// The request travels inside a signed [`ClientFrame::Envelope`] rather than
/// as a plain frame, so the daemon checks the phone's Ed25519 signature before
/// it writes the gate-releasing `HitlResponse`.
pub const HITL_RESPOND_METHOD: &str = "channel/hitl_respond";
28
29/// Frames the phone sends to the Mac endpoint.
30#[derive(Debug, Serialize, Deserialize)]
31#[serde(tag = "type", rename_all = "snake_case")]
32pub enum ClientFrame {
33    /// Pairing / auth handshake. `token` is the one-time token from the QR
34    /// code; `pubkey` is the phone's multibase Ed25519 public key, which the
35    /// Mac records as a paired device on success. `agent` is the canonical
36    /// agent name the phone wants to talk to (e.g. `"mur"`).
37    Hello {
38        pubkey: String,
39        token: String,
40        agent: String,
41    },
42    /// Enrollment step 1 (proto ≥ 2): the phone announces intent WITHOUT sending
43    /// the token. `wid` is the pairing-window id from the QR; `pubkey` is the
44    /// phone's Ed25519 key to enroll. The daemon replies [`ServerFrame::PairChallenge`].
45    HelloInit {
46        proto: u32,
47        agent: String,
48        pubkey: String,
49        wid: String,
50    },
51    /// Enrollment step 3: proof of token possession. `proof` is
52    /// HMAC-SHA256(token, transcript) — the token itself is never transmitted.
53    HelloProof { wid: String, proof: Vec<u8> },
54    /// Reconnect by paired KEY, no enrollment token (the steady-state auth). The
55    /// phone announces its `pubkey`; the daemon replies [`ServerFrame::Challenge`]
56    /// with a fresh per-connection nonce, which the phone signs back in a
57    /// [`ClientFrame::ResumeProof`]. Distinct from `Hello` so single-use
58    /// enrollment windows never gate reconnects.
59    Resume { pubkey: String, agent: String },
60    /// Proof for a `Resume`: a [`SignedEnvelope`] whose `payload` is exactly the
61    /// challenge nonce bytes, signed by the device key. The daemon-issued nonce
62    /// makes a captured proof non-replayable on a new connection.
63    ResumeProof { envelope: SignedEnvelope },
64    /// A signed A2A request destined for the agent (text-only path).
65    Envelope { envelope: SignedEnvelope },
66    /// Phone begins a voice utterance. The Mac clears its audio accumulator and
67    /// prepares for incoming chunks at `sample_rate` Hz, mono f32.
68    AudioStreamStart { sample_rate: u32 },
69    /// One chunk of raw PCM (f32 LE, `sample_rate` Hz mono, standard base64).
70    /// Authenticated by the connection (paired at `Hello`); no per-chunk sig.
71    AudioChunk { data: String },
72    /// Phone finished speaking. Mac should run STT on the accumulated audio,
73    /// forward to the agent, and stream TTS audio back.
74    AudioStreamEnd,
75    /// Pull channel data. `op` ∈ "list" | "events". For "events", `channel_id`
76    /// is required and `since_seq` (inclusive) enables catch-up. Authenticated
77    /// by the paired connection (like the audio frames).
78    ChannelQuery {
79        op: String,
80        #[serde(default)]
81        channel_id: Option<String>,
82        #[serde(default)]
83        since_seq: Option<u64>,
84    },
85}
86
87/// Frames the Mac endpoint sends back to the phone.
88#[derive(Debug, Serialize, Deserialize)]
89#[serde(tag = "type", rename_all = "snake_case")]
90pub enum ServerFrame {
91    /// Handshake accepted; the phone is now paired with `agent`. For a proof
92    /// enrollment (proto ≥ 2), `confirm` is HMAC-SHA256(token, daemon→phone
93    /// transcript) — the phone verifies it to authenticate the daemon (closes
94    /// rogue-daemon/relay MITM). Empty for legacy Hello / Resume paths.
95    Paired {
96        agent: String,
97        #[serde(default)]
98        confirm: Vec<u8>,
99    },
100    /// Enrollment step 2 (proto ≥ 2): challenge for a [`ClientFrame::HelloInit`].
101    /// `nonce` is a fresh 32-byte liveness value; `did` is the daemon agent's
102    /// Ed25519 identity (multibase), which the phone cross-checks against the QR
103    /// to bind the endpoint on the TLS-less LAN.
104    PairChallenge {
105        wid: String,
106        nonce: Vec<u8>,
107        did: String,
108    },
109    /// The handshake or a later frame was rejected.
110    Rejected { reason: String },
111    /// Challenge issued in reply to a [`ClientFrame::Resume`]: a fresh
112    /// per-connection nonce the phone must sign (in a `ResumeProof`) to prove it
113    /// holds the paired device key.
114    Challenge { nonce: String },
115    /// An asynchronous event mirrored to the phone. `name` is dot-namespaced
116    /// (`mobile.transcript`, `mobile.reply`, …) to match the Hub `EventBus`
117    /// names used for desktop mirroring.
118    Event {
119        name: String,
120        payload: serde_json::Value,
121    },
122    /// Mac whisper.cpp authoritative transcript for the user's just-spoken
123    /// utterance. Overrides the on-device SFSpeech partial. `is_final: true`
124    /// means this is the definitive text for this turn.
125    Transcript { text: String, is_final: bool },
126    /// A chunk of Kokoro TTS audio (f32 LE PCM, 24 kHz mono). The phone
127    /// accumulates chunks until `done: true`, then plays them back.
128    /// `base64` is the standard base64 encoding of the raw bytes.
129    AudioChunk {
130        base64: String,
131        sample_rate: u32,
132        done: bool,
133    },
134    /// Response to a `ChannelQuery`. `op` echoes the request; `payload` is a
135    /// JSON array (channel summaries for "list", events for "events").
136    ChannelData {
137        op: String,
138        payload: serde_json::Value,
139    },
140}
141
// ── Enrollment proof (proto ≥ 2) ─────────────────────────────────────────────
//
// The phone proves possession of the single-use pairing token WITHOUT
// transmitting it: HMAC-SHA256(token, transcript). The token is the HMAC key;
// it is never part of the signed bytes and never on the wire, which closes off
// on-path sniffing of the TLS-less LAN WebSocket. All primitives live here
// (the only crate with the crypto deps); the daemon and SDK call these free
// functions.
149
/// Domain tag that opens every enrollment transcript. It embeds the protocol
/// version so a proof is bound to it (anti-downgrade).
const PAIR_DOMAIN: &[u8] = b"mur-pair-v2";
/// Role tag for the phone→daemon proof. The two directions use distinct tags
/// so a daemon→phone confirm cannot be replayed as a phone→daemon proof
/// (reflection defense).
pub const PAIR_ROLE_PHONE_TO_DAEMON: &[u8] = b"mur-pair-v2/phone->daemon";
/// Role tag for the daemon→phone confirm; the counterpart of
/// [`PAIR_ROLE_PHONE_TO_DAEMON`].
pub const PAIR_ROLE_DAEMON_TO_PHONE: &[u8] = b"mur-pair-v2/daemon->phone";
156
157/// Build the enrollment transcript bound to every field that scopes the proof,
158/// using FIXED-length (u32_le) length-prefixing so no field boundary is
159/// ambiguous (mirrors `bridge::envelope::signing_bytes`). The token is NOT
160/// included — it is the HMAC key in [`pair_proof`], never in the signed bytes.
161pub fn pair_transcript(
162    role: &[u8],
163    proto: u32,
164    agent: &str,
165    wid: &str,
166    did: &str,
167    phone_pubkey: &str,
168    nonce: &[u8],
169) -> Vec<u8> {
170    fn put(out: &mut Vec<u8>, field: &[u8]) {
171        out.extend_from_slice(&(field.len() as u32).to_le_bytes());
172        out.extend_from_slice(field);
173    }
174    let mut out = Vec::new();
175    put(&mut out, PAIR_DOMAIN);
176    put(&mut out, role);
177    out.extend_from_slice(&proto.to_le_bytes());
178    put(&mut out, agent.as_bytes());
179    put(&mut out, wid.as_bytes());
180    put(&mut out, did.as_bytes());
181    put(&mut out, phone_pubkey.as_bytes());
182    put(&mut out, nonce);
183    out
184}
185
186/// HMAC-SHA256(token, transcript) — full 256-bit output, never truncated.
187pub fn pair_proof(token: &[u8], transcript: &[u8]) -> [u8; 32] {
188    use hmac::{Hmac, Mac};
189    let mut mac = <Hmac<sha2::Sha256> as Mac>::new_from_slice(token)
190        .expect("HMAC accepts a key of any length");
191    mac.update(transcript);
192    mac.finalize().into_bytes().into()
193}
194
195/// Constant-time byte-slice equality (length is not secret; contents compared
196/// in constant time so a partial match doesn't leak via timing).
197pub fn ct_verify(a: &[u8], b: &[u8]) -> bool {
198    use subtle::ConstantTimeEq;
199    a.len() == b.len() && a.ct_eq(b).into()
200}
201
202/// A fresh 32-byte liveness nonce for the enrollment challenge (OS CSPRNG).
203pub fn mint_nonce() -> [u8; 32] {
204    use rand_core::RngCore;
205    let mut n = [0u8; 32];
206    rand_core::OsRng.fill_bytes(&mut n);
207    n
208}
209
#[cfg(test)]
mod tests {
    use super::*;

    /// `ChannelQuery` and `ChannelData` survive a JSON round trip and carry
    /// the expected snake_case `type` tag.
    #[test]
    fn channel_query_and_data_frames_round_trip() {
        let q = ClientFrame::ChannelQuery {
            op: "events".into(),
            channel_id: Some("c1".into()),
            since_seq: Some(3),
        };
        let s = serde_json::to_string(&q).unwrap();
        assert!(s.contains("\"type\":\"channel_query\""));
        let back: ClientFrame = serde_json::from_str(&s).unwrap();
        // A bare `matches!(..);` only yields a bool and asserts nothing, so
        // destructure and check every field of the decoded frame instead.
        match back {
            ClientFrame::ChannelQuery {
                op,
                channel_id,
                since_seq,
            } => {
                assert_eq!(op, "events");
                assert_eq!(channel_id.as_deref(), Some("c1"));
                assert_eq!(since_seq, Some(3));
            }
            other => panic!("expected ChannelQuery, got {other:?}"),
        }

        let d = ServerFrame::ChannelData {
            op: "list".into(),
            payload: serde_json::json!([]),
        };
        let s2 = serde_json::to_string(&d).unwrap();
        assert!(s2.contains("\"type\":\"channel_data\""));
        let back2: ServerFrame = serde_json::from_str(&s2).unwrap();
        match back2 {
            ServerFrame::ChannelData { op, payload } => {
                assert_eq!(op, "list");
                assert_eq!(payload, serde_json::json!([]));
            }
            other => panic!("expected ChannelData, got {other:?}"),
        }
    }

    /// The same token and transcript reproduce the proof; another token does not.
    #[test]
    fn pair_proof_round_trips_and_rejects_wrong_token() {
        let token = b"the-122-bit-token";
        let t = pair_transcript(
            PAIR_ROLE_PHONE_TO_DAEMON,
            2,
            "mur",
            "wid-1",
            "did-1",
            "zPHONE",
            &[7u8; 32],
        );
        let proof = pair_proof(token, &t);
        // Same token + transcript verifies.
        assert!(ct_verify(&proof, &pair_proof(token, &t)));
        // A different token produces a different proof.
        assert!(!ct_verify(&proof, &pair_proof(b"wrong-token", &t)));
    }

    /// Length-prefixing removes boundary ambiguity, and the role tag separates
    /// the two directions.
    #[test]
    fn transcript_is_unambiguous_and_direction_tagged() {
        // Field boundaries are length-prefixed: moving a byte across a boundary
        // changes the transcript (no concatenation ambiguity).
        let a = pair_transcript(
            PAIR_ROLE_PHONE_TO_DAEMON,
            2,
            "mur",
            "w",
            "d",
            "p",
            &[0u8; 32],
        );
        let b = pair_transcript(
            PAIR_ROLE_PHONE_TO_DAEMON,
            2,
            "mu",
            "rw",
            "d",
            "p",
            &[0u8; 32],
        );
        assert_ne!(a, b, "length-prefixing prevents field-boundary collisions");

        // The two directions produce different transcripts under the same fields,
        // so a daemon→phone confirm can't be replayed as a phone→daemon proof.
        let phone = pair_transcript(
            PAIR_ROLE_PHONE_TO_DAEMON,
            2,
            "mur",
            "w",
            "d",
            "p",
            &[1u8; 32],
        );
        let daemon = pair_transcript(
            PAIR_ROLE_DAEMON_TO_PHONE,
            2,
            "mur",
            "w",
            "d",
            "p",
            &[1u8; 32],
        );
        assert_ne!(phone, daemon, "per-direction role tags differ");
        let tok = b"k";
        assert!(!ct_verify(
            &pair_proof(tok, &phone),
            &pair_proof(tok, &daemon)
        ));
    }

    /// Two consecutive nonces differ.
    #[test]
    fn mint_nonce_is_fresh() {
        assert_ne!(mint_nonce(), mint_nonce(), "nonces must not repeat");
    }

    /// Neither enrollment frame the phone sends contains the token text.
    #[test]
    fn enrollment_frames_never_carry_the_token() {
        // The proof is KEYED by the token, but the token itself must never appear
        // in any frame the phone sends — that is the whole point of the handshake.
        let token = "SECRET-TOKEN-do-not-leak-0000";
        let t = pair_transcript(
            PAIR_ROLE_PHONE_TO_DAEMON,
            2,
            "mur",
            "wid",
            "did",
            "zPHONE",
            &[0u8; 32],
        );
        let proof = pair_proof(token.as_bytes(), &t);
        let frames = [
            serde_json::to_string(&ClientFrame::HelloInit {
                proto: 2,
                agent: "mur".into(),
                pubkey: "zPHONE".into(),
                wid: "wid".into(),
            })
            .unwrap(),
            serde_json::to_string(&ClientFrame::HelloProof {
                wid: "wid".into(),
                proof: proof.to_vec(),
            })
            .unwrap(),
        ];
        for f in frames {
            assert!(!f.contains(token), "token leaked into a wire frame: {f}");
        }
    }
}
343}