use derive_more::Display;
/// Preset that selects fallback latency settings for the voice pipeline.
///
/// `Display` is derived so that it prints the same identifier returned by
/// `as_str`, and the derived `Default` yields [`LatencyProfile::Balanced`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Display)]
#[display("{}", self.as_str())]
pub enum LatencyProfile {
/// Identified as `"low_latency"`; the smallest fallback values
/// (`default_transcription_delay_ms` = 240, `default_tts_streaming_interval` = 0.24).
Fast,
/// Identified as `"balanced"`; the default variant
/// (`default_transcription_delay_ms` = 480, `default_tts_streaming_interval` = 0.32).
#[default]
Balanced,
/// Identified as `"high_quality"`; the largest fallback values
/// (`default_transcription_delay_ms` = 960, `default_tts_streaming_interval` = 0.48).
Quality,
}
impl LatencyProfile {
pub const fn as_str(self) -> &'static str {
match self {
LatencyProfile::Fast => "low_latency",
LatencyProfile::Balanced => "balanced",
LatencyProfile::Quality => "high_quality",
}
}
#[inline(always)]
#[must_use]
pub fn is_fast(self) -> bool {
matches!(self, LatencyProfile::Fast)
}
#[inline(always)]
#[must_use]
pub fn is_balanced(self) -> bool {
matches!(self, LatencyProfile::Balanced)
}
#[inline(always)]
#[must_use]
pub fn is_quality(self) -> bool {
matches!(self, LatencyProfile::Quality)
}
pub const fn default_transcription_delay_ms(self) -> u32 {
match self {
LatencyProfile::Fast => 240,
LatencyProfile::Balanced => 480,
LatencyProfile::Quality => 960,
}
}
pub const fn default_tts_streaming_interval(self) -> f32 {
match self {
LatencyProfile::Fast => 0.24,
LatencyProfile::Balanced => 0.32,
LatencyProfile::Quality => 0.48,
}
}
}
/// Configuration for the voice pipeline.
///
/// Every field is private: read it through the getter of the same name and
/// change it through the matching consuming `with_*` builder. Defaults come
/// from [`VoicePipelineConfig::new`], which the [`Default`] impl also uses.
///
/// `stt_transcription_delay_ms` and `tts_streaming_interval` are `None` by
/// default and then fall back to a value chosen by `latency_profile`; see
/// [`VoicePipelineConfig::resolved`].
///
/// NOTE(review): the stages that consume these values live outside this
/// file, so units and semantics beyond what the field names state should be
/// confirmed against those consumers.
#[derive(Debug, Clone)]
pub struct VoicePipelineConfig {
    // --- Audio input / output ---
    /// Input sample rate (default `16_000`).
    input_sample_rate: u32,
    /// Optional output sample rate (default `None`).
    output_sample_rate: Option<u32>,
    /// Number of input channels (default `1`).
    input_channels: u16,
    /// Frame duration in milliseconds (default `32`).
    frame_duration_ms: u32,
    /// Profile supplying fallbacks for the unset latency settings
    /// (default [`LatencyProfile::default`]).
    latency_profile: LatencyProfile,

    // --- `stt_*` settings ---
    /// STT model identifier.
    stt_model: String,
    /// Explicit transcription delay in milliseconds; `None` (the default)
    /// defers to `latency_profile`.
    stt_transcription_delay_ms: Option<u32>,
    /// Maximum decode tokens per step (default `6`).
    stt_max_decode_tokens_per_step: u32,
    /// Maximum tokens per turn (default `256`).
    stt_max_turn_tokens: u32,
    /// Maximum finalization steps (default `96`).
    stt_finalization_max_steps: u32,

    // --- `vad_*` settings ---
    /// VAD model identifier.
    vad_model: String,
    /// VAD start threshold (default `0.35`).
    vad_start_threshold: f32,
    /// VAD stop threshold (default `0.2`).
    vad_stop_threshold: f32,
    /// VAD start frame count (default `1`).
    vad_start_frames: u32,
    /// VAD end-of-speech silence in milliseconds (default `600`).
    vad_end_silence_ms: u32,
    /// Maximum turn length in seconds (default `30.0`).
    vad_max_turn_seconds: f32,
    /// Pre-roll in milliseconds (default `250`).
    preroll_ms: u32,

    // --- `turn_*` settings ---
    /// Turn model identifier.
    turn_model: String,
    /// Turn threshold (default `0.5`).
    turn_threshold: f32,
    /// Maximum incomplete-turn silence in milliseconds (default `1600`).
    turn_max_incomplete_silence_ms: u32,

    // --- Response generation ---
    /// Response model identifier.
    response_model: String,
    /// System prompt text.
    system_prompt: String,

    // --- `tts_*` settings ---
    /// TTS model identifier.
    tts_model: String,
    /// TTS voice name (default `"cosette"`).
    tts_voice: String,
    /// Explicit TTS streaming interval; `None` (the default) defers to
    /// `latency_profile`.
    tts_streaming_interval: Option<f32>,
    /// Optional TTS temperature (default `None`).
    tts_temperature: Option<f32>,

    // --- Barge-in and echo settings ---
    /// Whether barge-in is enabled (default `true`).
    barge_in: bool,
    /// Minimum barge-in duration in milliseconds (default `180`).
    min_barge_in_ms: u32,
    /// Playback-echo ignore window in milliseconds (default `450`).
    ignore_playback_echo_ms: u32,
    /// Minimum echo delay in milliseconds (default `250`).
    echo_delay_min_ms: u32,
    /// Maximum echo delay in milliseconds (default `500`).
    echo_delay_max_ms: u32,
    /// Echo correlation step in milliseconds (default `32`).
    echo_correlation_step_ms: u32,
    /// Minimum transcript characters for barge-in (default `2`).
    barge_in_min_transcript_chars: u32,

    // --- Miscellaneous ---
    /// Whether audio is played (default `true`).
    play_audio: bool,
    /// Queue size (default `128`).
    queue_size: usize,
    /// Whether verbose mode is on (default `false`).
    verbose: bool,
}

impl VoicePipelineConfig {
    /// Creates a configuration populated with the built-in defaults.
    ///
    /// All `Option` fields start as `None`; the latency profile starts as
    /// [`LatencyProfile::default`].
    #[must_use]
    pub fn new() -> Self {
        Self {
            input_sample_rate: 16_000,
            output_sample_rate: None,
            input_channels: 1,
            frame_duration_ms: 32,
            latency_profile: LatencyProfile::default(),
            stt_model: String::from("mlx-community/Voxtral-Mini-4B-Realtime-2602-4bit"),
            stt_transcription_delay_ms: None,
            stt_max_decode_tokens_per_step: 6,
            stt_max_turn_tokens: 256,
            stt_finalization_max_steps: 96,
            vad_model: String::from("mlx-community/silero-vad"),
            vad_start_threshold: 0.35,
            vad_stop_threshold: 0.2,
            vad_start_frames: 1,
            vad_end_silence_ms: 600,
            vad_max_turn_seconds: 30.0,
            preroll_ms: 250,
            turn_model: String::from("mlx-community/smart-turn-v3"),
            turn_threshold: 0.5,
            turn_max_incomplete_silence_ms: 1600,
            response_model: String::from("mlx-community/NVIDIA-Nemotron-3-Nano-30B-A3B-4bit"),
            // The trailing `\` joins both lines into a single-line prompt.
            system_prompt: String::from(
                "You are a helpful voice assistant. Respond in natural spoken sentences. \
                 Never use markdown, emoji, or lists.",
            ),
            tts_model: String::from("mlx-community/pocket-tts"),
            tts_voice: String::from("cosette"),
            tts_streaming_interval: None,
            tts_temperature: None,
            barge_in: true,
            min_barge_in_ms: 180,
            ignore_playback_echo_ms: 450,
            echo_delay_min_ms: 250,
            echo_delay_max_ms: 500,
            echo_correlation_step_ms: 32,
            barge_in_min_transcript_chars: 2,
            play_audio: true,
            queue_size: 128,
            verbose: false,
        }
    }

    // ----------------------------------------------------------------- getters

    /// Returns the configured `input_sample_rate`.
    #[inline]
    #[must_use]
    pub fn input_sample_rate(&self) -> u32 {
        self.input_sample_rate
    }

    /// Returns the configured `output_sample_rate`, if any.
    #[inline]
    #[must_use]
    pub fn output_sample_rate(&self) -> Option<u32> {
        self.output_sample_rate
    }

    /// Returns the configured `input_channels`.
    #[inline]
    #[must_use]
    pub fn input_channels(&self) -> u16 {
        self.input_channels
    }

    /// Returns the configured `frame_duration_ms`.
    #[inline]
    #[must_use]
    pub fn frame_duration_ms(&self) -> u32 {
        self.frame_duration_ms
    }

    /// Returns the configured `latency_profile`.
    #[inline]
    #[must_use]
    pub fn latency_profile(&self) -> LatencyProfile {
        self.latency_profile
    }

    /// Returns the configured `stt_model`.
    #[inline]
    #[must_use]
    pub fn stt_model(&self) -> &str {
        &self.stt_model
    }

    /// Returns the raw `stt_transcription_delay_ms` override.
    ///
    /// `None` means no explicit value was set; use
    /// [`Self::resolved_transcription_delay_ms`] for the effective value.
    #[inline]
    #[must_use]
    pub fn stt_transcription_delay_ms(&self) -> Option<u32> {
        self.stt_transcription_delay_ms
    }

    /// Returns the configured `stt_max_decode_tokens_per_step`.
    #[inline]
    #[must_use]
    pub fn stt_max_decode_tokens_per_step(&self) -> u32 {
        self.stt_max_decode_tokens_per_step
    }

    /// Returns the configured `stt_max_turn_tokens`.
    #[inline]
    #[must_use]
    pub fn stt_max_turn_tokens(&self) -> u32 {
        self.stt_max_turn_tokens
    }

    /// Returns the configured `stt_finalization_max_steps`.
    #[inline]
    #[must_use]
    pub fn stt_finalization_max_steps(&self) -> u32 {
        self.stt_finalization_max_steps
    }

    /// Returns the configured `vad_model`.
    #[inline]
    #[must_use]
    pub fn vad_model(&self) -> &str {
        &self.vad_model
    }

    /// Returns the configured `vad_start_threshold`.
    #[inline]
    #[must_use]
    pub fn vad_start_threshold(&self) -> f32 {
        self.vad_start_threshold
    }

    /// Returns the configured `vad_stop_threshold`.
    #[inline]
    #[must_use]
    pub fn vad_stop_threshold(&self) -> f32 {
        self.vad_stop_threshold
    }

    /// Returns the configured `vad_start_frames`.
    #[inline]
    #[must_use]
    pub fn vad_start_frames(&self) -> u32 {
        self.vad_start_frames
    }

    /// Returns the configured `vad_end_silence_ms`.
    #[inline]
    #[must_use]
    pub fn vad_end_silence_ms(&self) -> u32 {
        self.vad_end_silence_ms
    }

    /// Returns the configured `vad_max_turn_seconds`.
    #[inline]
    #[must_use]
    pub fn vad_max_turn_seconds(&self) -> f32 {
        self.vad_max_turn_seconds
    }

    /// Returns the configured `preroll_ms`.
    #[inline]
    #[must_use]
    pub fn preroll_ms(&self) -> u32 {
        self.preroll_ms
    }

    /// Returns the configured `turn_model`.
    #[inline]
    #[must_use]
    pub fn turn_model(&self) -> &str {
        &self.turn_model
    }

    /// Returns the configured `turn_threshold`.
    #[inline]
    #[must_use]
    pub fn turn_threshold(&self) -> f32 {
        self.turn_threshold
    }

    /// Returns the configured `turn_max_incomplete_silence_ms`.
    #[inline]
    #[must_use]
    pub fn turn_max_incomplete_silence_ms(&self) -> u32 {
        self.turn_max_incomplete_silence_ms
    }

    /// Returns the configured `response_model`.
    #[inline]
    #[must_use]
    pub fn response_model(&self) -> &str {
        &self.response_model
    }

    /// Returns the configured `system_prompt`.
    #[inline]
    #[must_use]
    pub fn system_prompt(&self) -> &str {
        &self.system_prompt
    }

    /// Returns the configured `tts_model`.
    #[inline]
    #[must_use]
    pub fn tts_model(&self) -> &str {
        &self.tts_model
    }

    /// Returns the configured `tts_voice`.
    #[inline]
    #[must_use]
    pub fn tts_voice(&self) -> &str {
        &self.tts_voice
    }

    /// Returns the raw `tts_streaming_interval` override.
    ///
    /// `None` means no explicit value was set; use
    /// [`Self::resolved_tts_streaming_interval`] for the effective value.
    #[inline]
    #[must_use]
    pub fn tts_streaming_interval(&self) -> Option<f32> {
        self.tts_streaming_interval
    }

    /// Returns the configured `tts_temperature`, if any.
    #[inline]
    #[must_use]
    pub fn tts_temperature(&self) -> Option<f32> {
        self.tts_temperature
    }

    /// Returns the configured `barge_in` flag.
    #[inline]
    #[must_use]
    pub fn barge_in(&self) -> bool {
        self.barge_in
    }

    /// Returns the configured `min_barge_in_ms`.
    #[inline]
    #[must_use]
    pub fn min_barge_in_ms(&self) -> u32 {
        self.min_barge_in_ms
    }

    /// Returns the configured `ignore_playback_echo_ms`.
    #[inline]
    #[must_use]
    pub fn ignore_playback_echo_ms(&self) -> u32 {
        self.ignore_playback_echo_ms
    }

    /// Returns the configured `echo_delay_min_ms`.
    #[inline]
    #[must_use]
    pub fn echo_delay_min_ms(&self) -> u32 {
        self.echo_delay_min_ms
    }

    /// Returns the configured `echo_delay_max_ms`.
    #[inline]
    #[must_use]
    pub fn echo_delay_max_ms(&self) -> u32 {
        self.echo_delay_max_ms
    }

    /// Returns the configured `echo_correlation_step_ms`.
    #[inline]
    #[must_use]
    pub fn echo_correlation_step_ms(&self) -> u32 {
        self.echo_correlation_step_ms
    }

    /// Returns the configured `barge_in_min_transcript_chars`.
    #[inline]
    #[must_use]
    pub fn barge_in_min_transcript_chars(&self) -> u32 {
        self.barge_in_min_transcript_chars
    }

    /// Returns the configured `play_audio` flag.
    #[inline]
    #[must_use]
    pub fn play_audio(&self) -> bool {
        self.play_audio
    }

    /// Returns the configured `queue_size`.
    #[inline]
    #[must_use]
    pub fn queue_size(&self) -> usize {
        self.queue_size
    }

    /// Returns the configured `verbose` flag.
    #[inline]
    #[must_use]
    pub fn verbose(&self) -> bool {
        self.verbose
    }

    // ---------------------------------------------------------------- builders
    //
    // Each builder consumes `self`, overwrites one field and returns the
    // updated value so calls can be chained. No validation is performed.

    /// Sets `input_sample_rate`.
    #[must_use]
    pub fn with_input_sample_rate(mut self, v: u32) -> Self {
        self.input_sample_rate = v;
        self
    }

    /// Sets `output_sample_rate`; pass `None` to clear it.
    #[must_use]
    pub fn with_output_sample_rate(mut self, v: Option<u32>) -> Self {
        self.output_sample_rate = v;
        self
    }

    /// Sets `input_channels`.
    #[must_use]
    pub fn with_input_channels(mut self, v: u16) -> Self {
        self.input_channels = v;
        self
    }

    /// Sets `frame_duration_ms`.
    #[must_use]
    pub fn with_frame_duration_ms(mut self, v: u32) -> Self {
        self.frame_duration_ms = v;
        self
    }

    /// Sets `latency_profile`.
    #[must_use]
    pub fn with_latency_profile(mut self, v: LatencyProfile) -> Self {
        self.latency_profile = v;
        self
    }

    /// Sets `stt_model` from anything convertible into a `String`.
    #[must_use]
    pub fn with_stt_model(mut self, v: impl Into<String>) -> Self {
        self.stt_model = v.into();
        self
    }

    /// Sets `stt_transcription_delay_ms`; pass `None` to fall back to the
    /// latency profile.
    #[must_use]
    pub fn with_stt_transcription_delay_ms(mut self, v: Option<u32>) -> Self {
        self.stt_transcription_delay_ms = v;
        self
    }

    /// Sets `stt_max_decode_tokens_per_step`.
    #[must_use]
    pub fn with_stt_max_decode_tokens_per_step(mut self, v: u32) -> Self {
        self.stt_max_decode_tokens_per_step = v;
        self
    }

    /// Sets `stt_max_turn_tokens`.
    #[must_use]
    pub fn with_stt_max_turn_tokens(mut self, v: u32) -> Self {
        self.stt_max_turn_tokens = v;
        self
    }

    /// Sets `stt_finalization_max_steps`.
    #[must_use]
    pub fn with_stt_finalization_max_steps(mut self, v: u32) -> Self {
        self.stt_finalization_max_steps = v;
        self
    }

    /// Sets `vad_model` from anything convertible into a `String`.
    #[must_use]
    pub fn with_vad_model(mut self, v: impl Into<String>) -> Self {
        self.vad_model = v.into();
        self
    }

    /// Sets `vad_start_threshold`.
    #[must_use]
    pub fn with_vad_start_threshold(mut self, v: f32) -> Self {
        self.vad_start_threshold = v;
        self
    }

    /// Sets `vad_stop_threshold`.
    #[must_use]
    pub fn with_vad_stop_threshold(mut self, v: f32) -> Self {
        self.vad_stop_threshold = v;
        self
    }

    /// Sets `vad_start_frames`.
    #[must_use]
    pub fn with_vad_start_frames(mut self, v: u32) -> Self {
        self.vad_start_frames = v;
        self
    }

    /// Sets `vad_end_silence_ms`.
    #[must_use]
    pub fn with_vad_end_silence_ms(mut self, v: u32) -> Self {
        self.vad_end_silence_ms = v;
        self
    }

    /// Sets `vad_max_turn_seconds`.
    #[must_use]
    pub fn with_vad_max_turn_seconds(mut self, v: f32) -> Self {
        self.vad_max_turn_seconds = v;
        self
    }

    /// Sets `preroll_ms`.
    #[must_use]
    pub fn with_preroll_ms(mut self, v: u32) -> Self {
        self.preroll_ms = v;
        self
    }

    /// Sets `turn_model` from anything convertible into a `String`.
    #[must_use]
    pub fn with_turn_model(mut self, v: impl Into<String>) -> Self {
        self.turn_model = v.into();
        self
    }

    /// Sets `turn_threshold`.
    #[must_use]
    pub fn with_turn_threshold(mut self, v: f32) -> Self {
        self.turn_threshold = v;
        self
    }

    /// Sets `turn_max_incomplete_silence_ms`.
    #[must_use]
    pub fn with_turn_max_incomplete_silence_ms(mut self, v: u32) -> Self {
        self.turn_max_incomplete_silence_ms = v;
        self
    }

    /// Sets `response_model` from anything convertible into a `String`.
    #[must_use]
    pub fn with_response_model(mut self, v: impl Into<String>) -> Self {
        self.response_model = v.into();
        self
    }

    /// Sets `system_prompt` from anything convertible into a `String`.
    #[must_use]
    pub fn with_system_prompt(mut self, v: impl Into<String>) -> Self {
        self.system_prompt = v.into();
        self
    }

    /// Sets `tts_model` from anything convertible into a `String`.
    #[must_use]
    pub fn with_tts_model(mut self, v: impl Into<String>) -> Self {
        self.tts_model = v.into();
        self
    }

    /// Sets `tts_voice` from anything convertible into a `String`.
    #[must_use]
    pub fn with_tts_voice(mut self, v: impl Into<String>) -> Self {
        self.tts_voice = v.into();
        self
    }

    /// Sets `tts_streaming_interval`; pass `None` to fall back to the
    /// latency profile.
    #[must_use]
    pub fn with_tts_streaming_interval(mut self, v: Option<f32>) -> Self {
        self.tts_streaming_interval = v;
        self
    }

    /// Sets `tts_temperature`; pass `None` to clear it.
    #[must_use]
    pub fn with_tts_temperature(mut self, v: Option<f32>) -> Self {
        self.tts_temperature = v;
        self
    }

    /// Sets the `barge_in` flag.
    #[must_use]
    pub fn with_barge_in(mut self, v: bool) -> Self {
        self.barge_in = v;
        self
    }

    /// Sets `min_barge_in_ms`.
    #[must_use]
    pub fn with_min_barge_in_ms(mut self, v: u32) -> Self {
        self.min_barge_in_ms = v;
        self
    }

    /// Sets `ignore_playback_echo_ms`.
    #[must_use]
    pub fn with_ignore_playback_echo_ms(mut self, v: u32) -> Self {
        self.ignore_playback_echo_ms = v;
        self
    }

    /// Sets `echo_delay_min_ms`.
    #[must_use]
    pub fn with_echo_delay_min_ms(mut self, v: u32) -> Self {
        self.echo_delay_min_ms = v;
        self
    }

    /// Sets `echo_delay_max_ms`.
    #[must_use]
    pub fn with_echo_delay_max_ms(mut self, v: u32) -> Self {
        self.echo_delay_max_ms = v;
        self
    }

    /// Sets `echo_correlation_step_ms`.
    #[must_use]
    pub fn with_echo_correlation_step_ms(mut self, v: u32) -> Self {
        self.echo_correlation_step_ms = v;
        self
    }

    /// Sets `barge_in_min_transcript_chars`.
    #[must_use]
    pub fn with_barge_in_min_transcript_chars(mut self, v: u32) -> Self {
        self.barge_in_min_transcript_chars = v;
        self
    }

    /// Sets the `play_audio` flag.
    #[must_use]
    pub fn with_play_audio(mut self, v: bool) -> Self {
        self.play_audio = v;
        self
    }

    /// Sets `queue_size`.
    #[must_use]
    pub fn with_queue_size(mut self, v: usize) -> Self {
        self.queue_size = v;
        self
    }

    /// Sets the `verbose` flag.
    #[must_use]
    pub fn with_verbose(mut self, v: bool) -> Self {
        self.verbose = v;
        self
    }

    // -------------------------------------------------------------- resolution

    /// Returns the config with both profile-dependent settings made explicit.
    ///
    /// After this call `stt_transcription_delay_ms` and
    /// `tts_streaming_interval` are always `Some`: an explicit override is
    /// kept as is, and a missing one is filled from the current
    /// `latency_profile`. Calling it again changes nothing.
    #[must_use]
    pub fn resolved(mut self) -> Self {
        // Reuse the accessors so the fallback rule is defined in one place.
        self.stt_transcription_delay_ms = Some(self.resolved_transcription_delay_ms());
        self.tts_streaming_interval = Some(self.resolved_tts_streaming_interval());
        self
    }

    /// Returns the effective transcription delay in milliseconds: the
    /// explicit override when set, otherwise the latency profile's default.
    #[must_use]
    pub fn resolved_transcription_delay_ms(&self) -> u32 {
        self.stt_transcription_delay_ms
            .unwrap_or_else(|| self.latency_profile.default_transcription_delay_ms())
    }

    /// Returns the effective TTS streaming interval: the explicit override
    /// when set, otherwise the latency profile's default.
    #[must_use]
    pub fn resolved_tts_streaming_interval(&self) -> f32 {
        self.tts_streaming_interval
            .unwrap_or_else(|| self.latency_profile.default_tts_streaming_interval())
    }
}

impl Default for VoicePipelineConfig {
    /// Equivalent to [`VoicePipelineConfig::new`].
    fn default() -> Self {
        Self::new()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance used when comparing `f32` settings.
    const EPS: f32 = 1e-6;

    /// Returns `true` when `actual` is within [`EPS`] of `expected`.
    fn close(actual: f32, expected: f32) -> bool {
        (actual - expected).abs() < EPS
    }

    /// A freshly constructed config exposes the expected default for every field.
    #[test]
    fn defaults_match_python_dataclass() {
        let defaults = VoicePipelineConfig::new();

        // Audio and profile.
        assert_eq!(defaults.input_sample_rate(), 16_000);
        assert_eq!(defaults.output_sample_rate(), None);
        assert_eq!(defaults.input_channels(), 1);
        assert_eq!(defaults.frame_duration_ms(), 32);
        assert_eq!(defaults.latency_profile(), LatencyProfile::Balanced);

        // `stt_*`.
        assert_eq!(
            defaults.stt_model(),
            "mlx-community/Voxtral-Mini-4B-Realtime-2602-4bit"
        );
        assert_eq!(defaults.stt_transcription_delay_ms(), None);
        assert_eq!(defaults.stt_max_decode_tokens_per_step(), 6);
        assert_eq!(defaults.stt_max_turn_tokens(), 256);
        assert_eq!(defaults.stt_finalization_max_steps(), 96);

        // `vad_*` and pre-roll.
        assert_eq!(defaults.vad_model(), "mlx-community/silero-vad");
        assert!(close(defaults.vad_start_threshold(), 0.35));
        assert!(close(defaults.vad_stop_threshold(), 0.2));
        assert_eq!(defaults.vad_start_frames(), 1);
        assert_eq!(defaults.vad_end_silence_ms(), 600);
        assert!(close(defaults.vad_max_turn_seconds(), 30.0));
        assert_eq!(defaults.preroll_ms(), 250);

        // `turn_*`.
        assert_eq!(defaults.turn_model(), "mlx-community/smart-turn-v3");
        assert!(close(defaults.turn_threshold(), 0.5));
        assert_eq!(defaults.turn_max_incomplete_silence_ms(), 1600);

        // Response generation.
        assert_eq!(
            defaults.response_model(),
            "mlx-community/NVIDIA-Nemotron-3-Nano-30B-A3B-4bit"
        );
        assert!(defaults.system_prompt().contains("voice assistant"));

        // `tts_*`.
        assert_eq!(defaults.tts_model(), "mlx-community/pocket-tts");
        assert_eq!(defaults.tts_voice(), "cosette");
        assert_eq!(defaults.tts_streaming_interval(), None);
        assert_eq!(defaults.tts_temperature(), None);

        // Barge-in and echo.
        assert!(defaults.barge_in());
        assert_eq!(defaults.min_barge_in_ms(), 180);
        assert_eq!(defaults.ignore_playback_echo_ms(), 450);
        assert_eq!(defaults.echo_delay_min_ms(), 250);
        assert_eq!(defaults.echo_delay_max_ms(), 500);
        assert_eq!(defaults.echo_correlation_step_ms(), 32);
        assert_eq!(defaults.barge_in_min_transcript_chars(), 2);

        // Miscellaneous.
        assert!(defaults.play_audio());
        assert_eq!(defaults.queue_size(), 128);
        assert!(!defaults.verbose());
    }

    /// `resolved` fills both unset settings from the active latency profile.
    #[test]
    fn resolved_fills_profile_defaults() {
        // The untouched config relies on the default (balanced) profile.
        let balanced = VoicePipelineConfig::new().resolved();
        assert_eq!(balanced.stt_transcription_delay_ms(), Some(480));
        assert_eq!(balanced.tts_streaming_interval(), Some(0.32));

        let explicit_profiles: [(LatencyProfile, u32, f32); 2] = [
            (LatencyProfile::Fast, 240, 0.24),
            (LatencyProfile::Quality, 960, 0.48),
        ];
        for (profile, delay_ms, interval) in explicit_profiles {
            let filled = VoicePipelineConfig::new()
                .with_latency_profile(profile)
                .resolved();
            assert_eq!(filled.stt_transcription_delay_ms(), Some(delay_ms));
            assert_eq!(filled.tts_streaming_interval(), Some(interval));
        }
    }

    /// Explicit overrides win over the profile's fallback values.
    #[test]
    fn resolved_preserves_explicit_overrides() {
        let overridden = VoicePipelineConfig::new()
            .with_latency_profile(LatencyProfile::Fast)
            .with_stt_transcription_delay_ms(Some(123))
            .with_tts_streaming_interval(Some(0.07))
            .resolved();

        assert_eq!(overridden.stt_transcription_delay_ms(), Some(123));
        assert_eq!(overridden.tts_streaming_interval(), Some(0.07));
    }

    /// The by-reference `resolved_*` accessors match what `resolved` stores.
    #[test]
    fn resolved_accessors_agree_with_resolved_method() {
        let raw = VoicePipelineConfig::new().with_latency_profile(LatencyProfile::Quality);
        let folded = raw.clone().resolved();

        let stored_delay = folded.stt_transcription_delay_ms().unwrap();
        assert_eq!(raw.resolved_transcription_delay_ms(), stored_delay);

        let stored_interval = folded.tts_streaming_interval().unwrap();
        assert!(close(raw.resolved_tts_streaming_interval(), stored_interval));
    }

    /// `as_str` yields the documented identifier for every variant.
    #[test]
    fn latency_profile_as_str() {
        let expected = [
            (LatencyProfile::Fast, "low_latency"),
            (LatencyProfile::Balanced, "balanced"),
            (LatencyProfile::Quality, "high_quality"),
        ];
        for (profile, name) in expected {
            assert_eq!(profile.as_str(), name);
        }
    }

    /// `Display` renders the same identifier as `as_str`.
    #[test]
    fn latency_profile_display() {
        let expected = [
            (LatencyProfile::Fast, "low_latency"),
            (LatencyProfile::Balanced, "balanced"),
            (LatencyProfile::Quality, "high_quality"),
        ];
        for (profile, name) in expected {
            assert_eq!(profile.to_string(), name);
        }
    }

    /// Exactly one `is_*` predicate holds for each variant.
    #[test]
    fn latency_profile_is_variant_predicates() {
        // Columns: profile, is_fast, is_balanced, is_quality.
        let truth_table = [
            (LatencyProfile::Fast, true, false, false),
            (LatencyProfile::Balanced, false, true, false),
            (LatencyProfile::Quality, false, false, true),
        ];
        for (profile, fast, balanced, quality) in truth_table {
            assert_eq!(profile.is_fast(), fast);
            assert_eq!(profile.is_balanced(), balanced);
            assert_eq!(profile.is_quality(), quality);
        }
    }

    /// Every `with_*` builder writes the field read by the matching getter.
    #[test]
    fn builder_chain_sets_fields() {
        let built = VoicePipelineConfig::new()
            .with_input_sample_rate(8_000)
            .with_output_sample_rate(Some(24_000))
            .with_input_channels(2)
            .with_frame_duration_ms(16)
            .with_latency_profile(LatencyProfile::Fast)
            .with_stt_model("my-stt")
            .with_stt_transcription_delay_ms(Some(100))
            .with_stt_max_decode_tokens_per_step(3)
            .with_stt_max_turn_tokens(128)
            .with_stt_finalization_max_steps(48)
            .with_vad_model("my-vad")
            .with_vad_start_threshold(0.5)
            .with_vad_stop_threshold(0.3)
            .with_vad_start_frames(2)
            .with_vad_end_silence_ms(300)
            .with_vad_max_turn_seconds(15.0)
            .with_preroll_ms(100)
            .with_turn_model("my-turn")
            .with_turn_threshold(0.7)
            .with_turn_max_incomplete_silence_ms(800)
            .with_response_model("my-llm")
            .with_system_prompt("be concise")
            .with_tts_model("my-tts")
            .with_tts_voice("alice")
            .with_tts_streaming_interval(Some(0.1))
            .with_tts_temperature(Some(0.8))
            .with_barge_in(false)
            .with_min_barge_in_ms(90)
            .with_ignore_playback_echo_ms(200)
            .with_echo_delay_min_ms(100)
            .with_echo_delay_max_ms(300)
            .with_echo_correlation_step_ms(16)
            .with_barge_in_min_transcript_chars(5)
            .with_play_audio(false)
            .with_queue_size(64)
            .with_verbose(true);

        // Audio and profile.
        assert_eq!(built.input_sample_rate(), 8_000);
        assert_eq!(built.output_sample_rate(), Some(24_000));
        assert_eq!(built.input_channels(), 2);
        assert_eq!(built.frame_duration_ms(), 16);
        assert_eq!(built.latency_profile(), LatencyProfile::Fast);

        // `stt_*`.
        assert_eq!(built.stt_model(), "my-stt");
        assert_eq!(built.stt_transcription_delay_ms(), Some(100));
        assert_eq!(built.stt_max_decode_tokens_per_step(), 3);
        assert_eq!(built.stt_max_turn_tokens(), 128);
        assert_eq!(built.stt_finalization_max_steps(), 48);

        // `vad_*` and pre-roll.
        assert_eq!(built.vad_model(), "my-vad");
        assert!(close(built.vad_start_threshold(), 0.5));
        assert!(close(built.vad_stop_threshold(), 0.3));
        assert_eq!(built.vad_start_frames(), 2);
        assert_eq!(built.vad_end_silence_ms(), 300);
        assert!(close(built.vad_max_turn_seconds(), 15.0));
        assert_eq!(built.preroll_ms(), 100);

        // `turn_*`.
        assert_eq!(built.turn_model(), "my-turn");
        assert!(close(built.turn_threshold(), 0.7));
        assert_eq!(built.turn_max_incomplete_silence_ms(), 800);

        // Response generation.
        assert_eq!(built.response_model(), "my-llm");
        assert_eq!(built.system_prompt(), "be concise");

        // `tts_*`.
        assert_eq!(built.tts_model(), "my-tts");
        assert_eq!(built.tts_voice(), "alice");
        assert_eq!(built.tts_streaming_interval(), Some(0.1));
        assert_eq!(built.tts_temperature(), Some(0.8));

        // Barge-in and echo.
        assert!(!built.barge_in());
        assert_eq!(built.min_barge_in_ms(), 90);
        assert_eq!(built.ignore_playback_echo_ms(), 200);
        assert_eq!(built.echo_delay_min_ms(), 100);
        assert_eq!(built.echo_delay_max_ms(), 300);
        assert_eq!(built.echo_correlation_step_ms(), 16);
        assert_eq!(built.barge_in_min_transcript_chars(), 5);

        // Miscellaneous.
        assert!(!built.play_audio());
        assert_eq!(built.queue_size(), 64);
        assert!(built.verbose());
    }
}