1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
//! Wire protocol for the MUR mobile app ↔ Mac daemon WebSocket endpoint.
//!
//! Shared by `mur-mobile-sdk` (the phone client) and `mur-daemon` (the Mac
//! endpoint) so both ends agree on the framing. Every message is one JSON
//! object sent as a WebSocket *text* frame. The first client frame is a
//! [`ClientFrame::Hello`] pairing handshake; once the server replies
//! [`ServerFrame::Paired`], application traffic is carried as Ed25519-signed
//! [`SignedEnvelope`]s whose `payload` is the canonical JSON of an A2A
//! `JsonRpcRequest` — the same crypto MUR uses for agent↔agent bridge traffic.
//!
//! P3 adds a voice streaming path: the phone streams raw 16 kHz mono f32 PCM
//! chunks, the Mac runs whisper.cpp for an authoritative transcript, then
//! replies with Kokoro TTS audio chunks.
//!
//! Design: `docs/superpowers/specs/2026-06-05-mur-voice-mobile-app-design.md`.
use crate::bridge::envelope::SignedEnvelope;
use serde::{Deserialize, Serialize};
/// URL path at which the daemon exposes its mobile WebSocket endpoint.
///
/// Kept as a shared constant so the phone client and the Mac daemon connect
/// to and serve exactly the same route.
pub const MOBILE_WS_PATH: &str = "/api/v1/mobile/ws";
/// Messages travelling from the phone to the Mac endpoint.
///
/// On the wire each variant is a JSON object whose `type` field holds the
/// variant name in snake_case (for example `hello` or `audio_stream_start`);
/// the variant's own fields sit next to that tag.
#[derive(Debug, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ClientFrame {
    /// Pairing / authentication handshake.
    Hello {
        /// The phone's multibase Ed25519 public key. The Mac records it as a
        /// paired device when the handshake succeeds.
        pubkey: String,
        /// One-time token taken from the QR code.
        token: String,
        /// Canonical name of the agent the phone wants to talk to
        /// (e.g. `"mur"`).
        agent: String,
    },
    /// Text-only path: a signed A2A request addressed to the agent.
    Envelope {
        /// The signed request itself.
        envelope: SignedEnvelope,
    },
    /// Start of a voice utterance. The Mac clears its audio accumulator and
    /// gets ready for the chunks that follow.
    AudioStreamStart {
        /// Sample rate, in Hz, of the mono f32 audio about to be streamed.
        sample_rate: u32,
    },
    /// One chunk of raw PCM audio. Chunks carry no signature of their own;
    /// they are authenticated by the connection, which was paired at `Hello`.
    AudioChunk {
        /// Standard base64 of the samples: f32 little-endian, mono, at the
        /// `sample_rate` given when the stream started.
        data: String,
    },
    /// End of the utterance. The Mac should run STT on the audio it has
    /// accumulated, forward the result to the agent, and stream TTS audio
    /// back.
    AudioStreamEnd,
}
/// Messages travelling from the Mac endpoint back to the phone.
///
/// Each variant is serialized as a JSON object tagged by a `type` field that
/// holds the snake_case variant name (for example `paired` or `audio_chunk`).
#[derive(Debug, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ServerFrame {
    /// The handshake was accepted.
    Paired {
        /// Agent the phone is now paired with.
        agent: String,
    },
    /// The handshake, or some later frame, was refused.
    Rejected {
        /// Why the frame was refused.
        reason: String,
    },
    /// An asynchronous event mirrored to the phone.
    Event {
        /// Dot-namespaced event name (`mobile.transcript`, `mobile.reply`, …),
        /// matching the Hub `EventBus` names used for desktop mirroring.
        name: String,
        /// Event data as arbitrary JSON.
        payload: serde_json::Value,
    },
    /// Authoritative whisper.cpp transcript, produced on the Mac, of what the
    /// user just said. It overrides the on-device SFSpeech partial.
    Transcript {
        /// The transcribed text.
        text: String,
        /// `true` when this is the definitive text for the current turn.
        is_final: bool,
    },
    /// A chunk of Kokoro TTS audio (f32 LE PCM, 24 kHz mono).
    AudioChunk {
        /// Standard base64 encoding of the raw PCM bytes.
        base64: String,
        /// Sample rate of the chunk's audio, in Hz.
        sample_rate: u32,
        /// `true` marks the end of the reply: the phone accumulates chunks
        /// until it sees this, then plays them back.
        done: bool,
    },
}