car-voice 0.13.0

Voice I/O capability for CAR — mic capture, VAD, listener/speaker traits
Documentation
//! Voice subsystem configuration.

use serde::{Deserialize, Serialize};
use std::sync::Arc;
use tokio::sync::watch;

/// Default voice-context prompt overlay. Prepended to the system prompt
/// of voice-invoked inference calls so the model emits short,
/// voice-appropriate responses without further coaxing. Verbose by
/// design — quip's production canaries showed this exact shape works
/// where shorter prompts didn't.
///
/// The literal below is sent to the model verbatim (see
/// [`compose_voice_context`]), so edit it with the same care as code.
///
/// Override with [`VoiceConfig::voice_prompt_overlay`]; an explicit
/// empty string disables the overlay entirely. See
/// `docs/proposals/voice-sidecar-orchestration.md` §"Voice-context
/// prompt overlay".
pub const DEFAULT_VOICE_PROMPT_OVERLAY: &str =
    "[VOICE CONTEXT: This is a real-time voice call. Speed is critical — act immediately.
When checking email, get ALL emails in the inbox (not just unread — set unreadOnly to false).
The most important emails are ones that were read but never replied to or acted on.
Skip newsletters, marketing emails, and automated notifications.
Return ONLY subject lines and sender names. Do NOT include email bodies.
Focus on personal/work emails from real people that likely need a response.
When checking calendar, limit to the next 2 weeks and list only time, title, and attendees.
Do NOT ask clarifying questions — use sensible defaults and act.
Keep responses under 500 characters. The user is waiting on a live call.]";

/// How the listener decides when to record speech.
///
/// Serialized in `snake_case`: `"auto"`, `"push_to_talk"`, `"wake_word"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ListenerMode {
    /// Always-on listening with VAD-driven turn detection.
    /// Voice-first GUIs default to this.
    Auto,
    /// Manual control: caller drives `start_segment` / `end_segment`.
    PushToTalk,
    /// Always-on but only emits transcripts after a wake word is heard.
    WakeWord,
}

/// Which TTS backend to use.
///
/// Serialized in `snake_case`: `"elevenlabs"`, `"local"`, `"kokoro"`,
/// `"apple_speech"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TtsProvider {
    /// ElevenLabs cloud TTS (requires API key).
    Elevenlabs,
    /// Local OpenAI-compatible TTS server (e.g. mlx-audio, Piper). HTTP
    /// client — requires a separate server (often Python) running.
    Local,
    /// In-process Kokoro-82M TTS via MLX/Metal. No HTTP, no server, no
    /// Python. macOS-only. Model is pulled to the HuggingFace cache on
    /// first use.
    Kokoro,
    /// macOS AVSpeechSynthesizer — Apple's built-in TTS. Free, on-device,
    /// no model download, no MLX dependency. macOS only; non-macOS
    /// targets reject this variant in `provider::build_tts_speaker`.
    AppleSpeech,
}

impl Default for TtsProvider {
    fn default() -> Self {
        // On macOS, prefer Apple's built-in AVSpeechSynthesizer — free,
        // on-device, no model download, no Kokoro/MLX setup. Kokoro is
        // still available as an explicit opt-in for users who prefer its
        // voice character. Non-macOS targets fall back to ElevenLabs;
        // none of the local TTS paths run cross-platform without setup.
        #[cfg(target_os = "macos")]
        {
            Self::AppleSpeech
        }
        #[cfg(not(target_os = "macos"))]
        {
            Self::Elevenlabs
        }
    }
}

/// Which STT backend to use.
///
/// Serialized in `snake_case`: `"elevenlabs"`, `"whisper_cpp"`,
/// `"parakeet"`, `"apple_speech"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SttProvider {
    /// ElevenLabs Scribe cloud STT (requires API key).
    Elevenlabs,
    /// In-process Whisper via whisper.cpp. Runs entirely on-device
    /// with Metal acceleration on Apple Silicon. No HTTP, no server,
    /// no Python — model file is downloaded once and cached at
    /// `~/.tokhn/whisper/`.
    WhisperCpp,
    /// In-process Parakeet TDT via ONNX Runtime. Faster than whisper
    /// on Apple Silicon and emits per-token timestamps natively for
    /// cleaner streaming UX. Requires the `parakeet` cargo feature
    /// (pulls in `ort` + `ndarray`); model files (~600 MB) downloaded
    /// once to `~/.car/models/parakeet-tdt-0.6b-v2-int8/`.
    Parakeet,
    /// macOS SFSpeechRecognizer — Apple's built-in STT. On-device, free,
    /// multilingual, no model download. Requires Speech Recognition
    /// permission (host calls SFSpeechRecognizer.requestAuthorization at
    /// startup). macOS only; non-macOS targets reject this variant.
    AppleSpeech,
}

impl Default for SttProvider {
    fn default() -> Self {
        // On macOS, prefer Apple's built-in SFSpeechRecognizer — free,
        // on-device, no model download. whisper.cpp stays as an explicit
        // opt-in (and remains the cross-platform default elsewhere).
        #[cfg(target_os = "macos")]
        {
            Self::AppleSpeech
        }
        #[cfg(not(target_os = "macos"))]
        {
            Self::WhisperCpp
        }
    }
}

impl Default for ListenerMode {
    fn default() -> Self {
        Self::Auto
    }
}

/// Configuration for the voice subsystem.
///
/// Loaded from `tokhn-config` (or env vars / defaults). Channels do not
/// re-derive any of this — they pass it to [`crate::Listener::start`].
///
/// Every field carries a `#[serde(default)]`, so a partial config file
/// deserializes cleanly with the remaining fields filled from the
/// `default_*` helpers below.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct VoiceConfig {
    // ─── Provider selection ────────────────────────────────────────────────
    /// STT provider.
    #[serde(default)]
    pub stt_provider: SttProvider,

    /// TTS provider.
    #[serde(default)]
    pub tts_provider: TtsProvider,

    /// ElevenLabs API key. Resolved from `ELEVENLABS_API_KEY` env var if not
    /// set explicitly.
    #[serde(default)]
    pub elevenlabs_api_key: Option<String>,

    /// ElevenLabs voice ID for narration. Defaults to the TARS-style narrator
    /// the Tauri app was using.
    #[serde(default = "default_voice_id")]
    pub elevenlabs_voice_id: String,

    /// ElevenLabs TTS model.
    #[serde(default = "default_tts_model")]
    pub elevenlabs_tts_model: String,

    // ─── Local provider config ────────────────────────────────────────────
    /// Base URL for the local OpenAI-compatible TTS server.
    #[serde(default = "default_local_tts_url")]
    pub local_tts_url: String,

    /// Model name for local TTS (e.g. `"mlx-community/Kokoro-82M-bf16"`).
    #[serde(default = "default_local_tts_model")]
    pub local_tts_model: String,

    /// Whisper model identifier for the in-process whisper.cpp STT
    /// provider. Matches the suffix used by `ggerganov/whisper.cpp`
    /// on Hugging Face — e.g. `"large-v3-turbo-q5_0"`, `"medium-q5_0"`,
    /// `"tiny-q5_0"`. First-run launch downloads
    /// `ggml-<model>.bin` from there and caches it at
    /// `~/.tokhn/whisper/ggml-<model>.bin`.
    #[serde(default = "default_whisper_cpp_model")]
    pub whisper_cpp_model: String,

    /// Voice name for local TTS (provider-specific, e.g. `"af_heart"`).
    #[serde(default = "default_local_tts_voice")]
    pub local_tts_voice: String,

    /// Playback speed multiplier for local TTS.
    #[serde(default = "default_local_tts_speed")]
    pub local_tts_speed: f32,

    /// Sampling temperature for local TTS (Qwen3-TTS).
    #[serde(default = "default_local_tts_temperature")]
    pub local_tts_temperature: f32,

    /// Reference audio path for voice cloning (Qwen3-TTS-Base).
    #[serde(default)]
    pub local_tts_ref_audio: Option<String>,

    /// Reference text for voice cloning (Qwen3-TTS-Base).
    #[serde(default)]
    pub local_tts_ref_text: Option<String>,

    /// Natural language voice description for voice design (Qwen3-TTS-VoiceDesign).
    #[serde(default)]
    pub local_tts_instruct: Option<String>,

    // ─── Audio device ──────────────────────────────────────────────────────
    /// Optional input device name. `None` means use the OS default.
    #[serde(default)]
    pub input_device: Option<String>,

    /// Capture sample rate in Hz. Common values: 16000, 44100, 48000.
    /// 16 kHz mono is what ElevenLabs STT prefers.
    #[serde(default = "default_sample_rate")]
    pub sample_rate: u32,

    /// Language code for STT (e.g. `"en"`, `"es"`).
    #[serde(default = "default_language")]
    pub language: String,

    // ─── Listener UX policy ────────────────────────────────────────────────
    /// Listening mode (auto / push-to-talk / wake-word).
    #[serde(default)]
    pub mode: ListenerMode,

    /// Wake word(s) recognized in `WakeWord` mode.
    #[serde(default = "default_wake_words")]
    pub wake_words: Vec<String>,

    // ─── VAD tuning ────────────────────────────────────────────────────────
    /// VAD energy threshold above the noise floor, in dB. Raised from 9.0
    /// to 15.0 in commit `7033ca3` so the audio bed stops bleeding into the
    /// AirPods microphone.
    #[serde(default = "default_vad_threshold_db")]
    pub vad_threshold_db: f32,

    /// Minimum continuous voiced energy before SpeechStart fires (ms).
    #[serde(default = "default_speech_onset_ms")]
    pub speech_onset_ms: u32,

    /// Silence duration that ends a turn (ms).
    #[serde(default = "default_turn_end_ms")]
    pub turn_end_ms: u32,

    /// IIR smoothing factor for energy estimates (0.0–1.0). Higher = smoother.
    #[serde(default = "default_smoothing_factor")]
    pub smoothing_factor: f32,

    /// Hysteresis margin in dB to prevent rapid speech/silence flapping.
    #[serde(default = "default_hysteresis_db")]
    pub hysteresis_db: f32,

    // ─── Barge-in / capture-loop tuning ───────────────────────────────────
    /// While Tokhn is playing TTS, the VAD threshold is raised by this
    /// many dB so only loud user speech (a barge-in) registers. Lower
    /// = more sensitive to interruption; higher = more echo tolerance.
    #[serde(default = "default_barge_in_boost_db")]
    pub barge_in_boost_db: f32,

    /// Milliseconds to keep the threshold boost active *after* TTS ends,
    /// to absorb speaker tail / room reverb.
    #[serde(default = "default_boost_tail_ms")]
    pub boost_tail_ms: u64,

    /// Hard cap on segment length. Anything longer is force-finalized —
    /// VAD has been seen to lock onto background noise and produce
    /// 17-second "speech" segments that never naturally end.
    #[serde(default = "default_max_segment_ms")]
    pub max_segment_ms: u64,

    /// Reject finalized segments whose average RMS isn't this many dB
    /// above the calibrated noise floor. Drops low-energy noise before
    /// it reaches STT (which then hallucinates content from it).
    #[serde(default = "default_segment_min_snr_db")]
    pub segment_min_snr_db: f32,

    // ─── Voice orchestration ──────────────────────────────────────────────
    /// Voice-context prompt overlay prepended to system prompts on the
    /// voice-invoked inference path. `None` uses
    /// [`DEFAULT_VOICE_PROMPT_OVERLAY`]. An explicit `Some("".into())`
    /// disables the overlay (e.g. for callers who already supply their
    /// own voice-tuned system prompt).
    #[serde(default)]
    pub voice_prompt_overlay: Option<String>,

    /// Progress-phrase interval in seconds. While the sidecar is
    /// still running, play a short "still working on that" phrase
    /// every `progress_interval_secs`. `None` defaults to 8 (the
    /// fallback is applied by the consumer, not here).
    #[serde(default)]
    pub progress_interval_secs: Option<u64>,

    /// Maximum number of progress-phrase attempts before giving up
    /// on the sidecar. `None` defaults to 4 — combined with the
    /// 8-second interval that's a 32-second cap, lower than the
    /// default `sidecar_timeout` so progress is the dominant
    /// abandonment signal. (Fallback applied by the consumer.)
    #[serde(default)]
    pub max_progress_attempts: Option<u32>,
}

impl VoiceConfig {
    /// Construct a `VoiceConfig` by starting from [`Self::default`] and
    /// applying overrides read from environment variables. Supported
    /// vars (all optional; empty values for the URL/model vars are
    /// ignored):
    ///
    /// - `TOKHN_STT_PROVIDER` = `"whisper-cpp"` (aliases `"whispercpp"`,
    ///   `"whisper_cpp"`, `"local"`) | `"elevenlabs"` | `"parakeet"` |
    ///   `"apple_speech"` (aliases `"apple"`, `"sfspeech"`).
    /// - `TOKHN_STT_MODEL` — whisper.cpp model id for the in-process
    ///   STT provider (see [`VoiceConfig::whisper_cpp_model`]).
    /// - `TOKHN_TTS_PROVIDER` = `"local"` | `"kokoro"` | `"elevenlabs"`
    ///   | `"apple_speech"` (aliases `"apple"`, `"avspeech"`).
    /// - `TOKHN_TTS_URL` — local TTS server base URL.
    /// - `TOKHN_TTS_MODEL` — local TTS model id.
    ///
    /// Unknown values for the `*_PROVIDER` vars are ignored (default
    /// stays in effect); callers should verify a known provider is
    /// reachable before relying on it.
    pub fn from_env() -> Self {
        let mut cfg = Self::default();
        if let Ok(v) = std::env::var("TOKHN_STT_PROVIDER") {
            match v.to_lowercase().as_str() {
                // Accept `whisper-cpp`, `whispercpp`, and `local` as
                // aliases — `local` is a historical name that's still
                // in docs/shells, but the provider is in-process
                // whisper.cpp regardless.
                "whisper-cpp" | "whispercpp" | "whisper_cpp" | "local" => {
                    cfg.stt_provider = SttProvider::WhisperCpp;
                }
                "elevenlabs" | "eleven_labs" | "eleven-labs" => {
                    cfg.stt_provider = SttProvider::Elevenlabs;
                }
                "parakeet" | "parakeet-tdt" | "parakeet_tdt" => {
                    cfg.stt_provider = SttProvider::Parakeet;
                }
                "apple_speech" | "apple-speech" | "apple" | "sfspeech" => {
                    cfg.stt_provider = SttProvider::AppleSpeech;
                }
                _ => {}
            }
        }
        if let Ok(v) = std::env::var("TOKHN_STT_MODEL") {
            if !v.is_empty() {
                cfg.whisper_cpp_model = v;
            }
        }
        if let Ok(v) = std::env::var("TOKHN_TTS_PROVIDER") {
            match v.to_lowercase().as_str() {
                "local" => cfg.tts_provider = TtsProvider::Local,
                "kokoro" | "kokoro_native" | "kokoro-native" => {
                    cfg.tts_provider = TtsProvider::Kokoro;
                }
                "elevenlabs" | "eleven_labs" | "eleven-labs" => {
                    cfg.tts_provider = TtsProvider::Elevenlabs;
                }
                "apple_speech" | "apple-speech" | "apple" | "avspeech" => {
                    cfg.tts_provider = TtsProvider::AppleSpeech;
                }
                _ => {}
            }
        }
        if let Ok(v) = std::env::var("TOKHN_TTS_URL") {
            if !v.is_empty() {
                cfg.local_tts_url = v;
            }
        }
        if let Ok(v) = std::env::var("TOKHN_TTS_MODEL") {
            if !v.is_empty() {
                cfg.local_tts_model = v;
            }
        }
        cfg
    }
}

impl Default for VoiceConfig {
    /// Mirrors the per-field `#[serde(default)]` helpers, so a
    /// default-constructed config is identical to one deserialized
    /// from an empty document.
    fn default() -> Self {
        Self {
            stt_provider: SttProvider::default(),
            tts_provider: TtsProvider::default(),
            elevenlabs_api_key: None,
            elevenlabs_voice_id: default_voice_id(),
            elevenlabs_tts_model: default_tts_model(),
            local_tts_url: default_local_tts_url(),
            local_tts_model: default_local_tts_model(),
            whisper_cpp_model: default_whisper_cpp_model(),
            local_tts_voice: default_local_tts_voice(),
            local_tts_speed: default_local_tts_speed(),
            local_tts_temperature: default_local_tts_temperature(),
            local_tts_ref_audio: None,
            local_tts_ref_text: None,
            local_tts_instruct: None,
            input_device: None,
            sample_rate: default_sample_rate(),
            language: default_language(),
            mode: ListenerMode::default(),
            wake_words: default_wake_words(),
            vad_threshold_db: default_vad_threshold_db(),
            speech_onset_ms: default_speech_onset_ms(),
            turn_end_ms: default_turn_end_ms(),
            smoothing_factor: default_smoothing_factor(),
            hysteresis_db: default_hysteresis_db(),
            barge_in_boost_db: default_barge_in_boost_db(),
            boost_tail_ms: default_boost_tail_ms(),
            max_segment_ms: default_max_segment_ms(),
            segment_min_snr_db: default_segment_min_snr_db(),
            voice_prompt_overlay: None,
            progress_interval_secs: None,
            max_progress_attempts: None,
        }
    }
}

/// Compose the voice-context overlay with an optional caller system
/// prompt. Returns the combined string suitable for
/// `GenerateRequest.context`.
///
/// Behaviour:
/// - `config.voice_prompt_overlay = None` → use [`DEFAULT_VOICE_PROMPT_OVERLAY`].
/// - `config.voice_prompt_overlay = Some("")` → disable overlay
///   (caller's prompt is returned unchanged; `None` if no caller prompt).
/// - Both overlay and caller prompt present → overlay first, blank line,
///   then caller prompt.
pub fn compose_voice_context(config: &VoiceConfig, caller_context: Option<&str>) -> Option<String> {
    // An unset overlay means "use the built-in default"; only an
    // explicit empty string disables it.
    let overlay = match config.voice_prompt_overlay.as_deref() {
        Some(custom) => custom,
        None => DEFAULT_VOICE_PROMPT_OVERLAY,
    };
    if overlay.is_empty() {
        // Overlay disabled: pass the caller's prompt through untouched.
        return caller_context.map(str::to_string);
    }
    match caller_context {
        Some(ctx) => Some(format!("{overlay}\n\n{ctx}")),
        None => Some(overlay.to_string()),
    }
}

// ─────────────────────────────────────────────────────────────────────
// Hot-reloadable config
// ─────────────────────────────────────────────────────────────────────

/// Sender half of a hot-reloadable voice config.
///
/// The config owner (e.g. a file watcher or UI settings panel) calls
/// [`VoiceConfigSender::update`] when the config changes. All holders
/// of a [`VoiceConfigHandle`] see the new values immediately.
///
/// The `watch::Sender` is wrapped in an `Arc` so the sender itself can
/// be cloned and shared across owners.
#[derive(Debug, Clone)]
pub struct VoiceConfigSender {
    tx: Arc<watch::Sender<VoiceConfig>>,
}

/// Read-only handle to the current voice config. Cheap to clone —
/// listeners, VAD, and speakers each hold one.
///
/// `watch::Receiver` is already clonable, so no `Arc` is needed here.
#[derive(Debug, Clone)]
pub struct VoiceConfigHandle {
    rx: watch::Receiver<VoiceConfig>,
}

/// Create a sender/handle pair seeded with the initial config.
pub fn voice_config_watch(initial: VoiceConfig) -> (VoiceConfigSender, VoiceConfigHandle) {
    let (sender, receiver) = watch::channel(initial);
    let shared_tx = Arc::new(sender);
    let handle = VoiceConfigHandle { rx: receiver };
    (VoiceConfigSender { tx: shared_tx }, handle)
}

impl VoiceConfigSender {
    /// Push a new config to all handles. Only notifies if the config
    /// actually changed (avoids spurious wake-ups).
    pub fn update(&self, config: VoiceConfig) {
        // `send_if_modified` lets us compare and replace under the
        // watch lock, returning whether receivers should be notified.
        self.tx.send_if_modified(|current| {
            if *current == config {
                false
            } else {
                *current = config;
                true
            }
        });
    }

    /// Read the current config.
    pub fn current(&self) -> VoiceConfig {
        self.tx.borrow().clone()
    }
}

impl VoiceConfigHandle {
    /// Snapshot the current config.
    pub fn current(&self) -> VoiceConfig {
        self.rx.borrow().clone()
    }

    /// Wait for the config to change, returning the new value. Returns
    /// `None` if the sender is dropped.
    pub async fn changed(&mut self) -> Option<VoiceConfig> {
        match self.rx.changed().await {
            // `borrow_and_update` marks the value seen so the next
            // `changed()` call waits for a genuinely new value.
            Ok(()) => Some(self.rx.borrow_and_update().clone()),
            Err(_) => None,
        }
    }
}

/// Default ElevenLabs voice id.
fn default_voice_id() -> String {
    // TARS-style narrator voice from app/src/lib/voiceEngine.ts
    String::from("UznIBkKIQe3ZG2tGydre")
}

/// Default ElevenLabs TTS model.
fn default_tts_model() -> String {
    String::from("eleven_turbo_v2_5")
}

/// Default capture sample rate (Hz).
fn default_sample_rate() -> u32 {
    16_000
}

/// Default STT language code.
fn default_language() -> String {
    String::from("en")
}

/// Default wake words for `ListenerMode::WakeWord`.
fn default_wake_words() -> Vec<String> {
    ["tokhn", "token", "talking"].map(String::from).to_vec()
}

/// VAD threshold above noise floor (dB). Raised from 9.0 to 15.0 in
/// commit `7033ca3` — see [`VoiceConfig::vad_threshold_db`].
fn default_vad_threshold_db() -> f32 {
    15.0
}

/// Minimum continuous voiced energy before SpeechStart fires (ms).
fn default_speech_onset_ms() -> u32 {
    100
}

/// Silence duration that ends a turn (ms).
fn default_turn_end_ms() -> u32 {
    1400
}

/// IIR smoothing factor for energy estimates (0.0–1.0).
fn default_smoothing_factor() -> f32 {
    0.3
}

/// Hysteresis margin (dB) against speech/silence flapping.
fn default_hysteresis_db() -> f32 {
    3.0
}

/// Threshold boost (dB) applied while TTS is playing, so only a loud
/// barge-in registers over Tokhn's own audio.
fn default_barge_in_boost_db() -> f32 {
    18.0
}

/// How long (ms) the barge-in boost stays active after TTS stops.
fn default_boost_tail_ms() -> u64 {
    // 500ms is the sweet spot empirically: covers most macOS speaker
    // tails + near-field room reverb for Tokhn's own speech without
    // making real user responses feel lagged. At 350ms, late echo
    // after longer sentences occasionally slipped through the boosted
    // VAD threshold and started a spurious recording.
    500
}

/// Hard cap on segment length (ms) — force-finalizes runaway segments.
fn default_max_segment_ms() -> u64 {
    6_000
}

/// Minimum average segment SNR (dB over noise floor) to reach STT.
fn default_segment_min_snr_db() -> f32 {
    8.0
}

/// Default base URL of the local OpenAI-compatible TTS server.
fn default_local_tts_url() -> String {
    String::from("http://127.0.0.1:19280/v1")
}

/// Default local TTS model id.
fn default_local_tts_model() -> String {
    String::from("mlx-community/Kokoro-82M-bf16")
}

/// Default whisper.cpp model id. `large-v3-turbo-q5_0` is the
/// quantized turbo variant (~600 MB file, near-best accuracy, runs
/// well above real-time on M-series). Downloads from
/// `https://huggingface.co/ggerganov/whisper.cpp`.
fn default_whisper_cpp_model() -> String {
    String::from("large-v3-turbo-q5_0")
}

/// Default local TTS voice name.
fn default_local_tts_voice() -> String {
    String::from("af_heart")
}

/// Default local TTS playback speed multiplier.
fn default_local_tts_speed() -> f32 {
    1.0
}

/// Default local TTS sampling temperature.
fn default_local_tts_temperature() -> f32 {
    0.7
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a default config with the given overlay override.
    fn cfg_with_overlay(overlay: Option<&str>) -> VoiceConfig {
        VoiceConfig {
            voice_prompt_overlay: overlay.map(str::to_string),
            ..VoiceConfig::default()
        }
    }

    #[test]
    fn compose_voice_context_uses_default_when_unset() {
        let out = compose_voice_context(&cfg_with_overlay(None), None)
            .expect("overlay present by default");
        assert!(out.starts_with("[VOICE CONTEXT:"));
    }

    #[test]
    fn compose_voice_context_empty_overlay_disables() {
        let cfg = cfg_with_overlay(Some(""));
        assert_eq!(compose_voice_context(&cfg, None), None);
        assert_eq!(
            compose_voice_context(&cfg, Some("hi")),
            Some("hi".to_string())
        );
    }

    #[test]
    fn compose_voice_context_concatenates_with_caller_prompt() {
        let cfg = cfg_with_overlay(Some("OVERLAY"));
        assert_eq!(
            compose_voice_context(&cfg, Some("CALLER")),
            Some("OVERLAY\n\nCALLER".to_string())
        );
    }

    #[test]
    fn compose_voice_context_overlay_only_when_no_caller_prompt() {
        let cfg = cfg_with_overlay(Some("OVERLAY"));
        assert_eq!(
            compose_voice_context(&cfg, None),
            Some("OVERLAY".to_string())
        );
    }
}