Skip to main content

clawft_types/config/
voice.rs

//! Voice pipeline configuration types (W-VOICE workstream).
//!
//! Defines the full voice pipeline config: audio capture, STT, TTS,
//! voice activity detection, wake word, cloud fallback, and per-agent
//! voice personalities.
5
6use std::collections::HashMap;
7
8use serde::{Deserialize, Serialize};
9
10use super::personality::VoicePersonality;
11
12/// Voice pipeline configuration.
13#[derive(Debug, Clone, Default, Serialize, Deserialize)]
14pub struct VoiceConfig {
15    /// Enable voice features globally.
16    #[serde(default)]
17    pub enabled: bool,
18
19    /// Audio capture settings.
20    #[serde(default)]
21    pub audio: AudioConfig,
22
23    /// Speech-to-text settings.
24    #[serde(default)]
25    pub stt: SttConfig,
26
27    /// Text-to-speech settings.
28    #[serde(default)]
29    pub tts: TtsConfig,
30
31    /// Voice activity detection settings.
32    #[serde(default)]
33    pub vad: VadConfig,
34
35    /// Wake word detection settings.
36    #[serde(default)]
37    pub wake: WakeConfig,
38
39    /// Cloud fallback settings.
40    #[serde(default, alias = "cloudFallback")]
41    pub cloud_fallback: CloudFallbackConfig,
42
43    /// Per-agent voice personality map.
44    ///
45    /// Keys are agent names/IDs, values are their voice personalities.
46    /// Agents not in this map use the default personality.
47    #[serde(default, alias = "personalities")]
48    pub personalities: HashMap<String, VoicePersonality>,
49}
50
51/// Audio capture/playback configuration.
52#[derive(Debug, Clone, Serialize, Deserialize)]
53pub struct AudioConfig {
54    /// Sample rate in Hz.
55    #[serde(default = "default_sample_rate", alias = "sampleRate")]
56    pub sample_rate: u32,
57
58    /// Audio chunk size in samples.
59    #[serde(default = "default_chunk_size", alias = "chunkSize")]
60    pub chunk_size: u32,
61
62    /// Number of audio channels (1 = mono).
63    #[serde(default = "default_audio_channels")]
64    pub channels: u16,
65
66    /// Input device name (None = system default).
67    #[serde(default, alias = "inputDevice")]
68    pub input_device: Option<String>,
69
70    /// Output device name (None = system default).
71    #[serde(default, alias = "outputDevice")]
72    pub output_device: Option<String>,
73}
74
/// Fallback value for `AudioConfig::sample_rate`, in Hz.
fn default_sample_rate() -> u32 {
    16_000
}
/// Fallback value for `AudioConfig::chunk_size`, in samples.
fn default_chunk_size() -> u32 {
    const SAMPLES_PER_CHUNK: u32 = 512;
    SAMPLES_PER_CHUNK
}
/// Fallback value for `AudioConfig::channels` (a single channel, i.e. mono).
fn default_audio_channels() -> u16 {
    const MONO: u16 = 1;
    MONO
}
84
85impl Default for AudioConfig {
86    fn default() -> Self {
87        Self {
88            sample_rate: default_sample_rate(),
89            chunk_size: default_chunk_size(),
90            channels: default_audio_channels(),
91            input_device: None,
92            output_device: None,
93        }
94    }
95}
96
97/// Speech-to-text configuration.
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct SttConfig {
100    /// Enable STT.
101    #[serde(default = "super::default_true")]
102    pub enabled: bool,
103
104    /// STT model name or path.
105    #[serde(default = "default_stt_model")]
106    pub model: String,
107
108    /// Language code (e.g. "en", "zh", "es"). Empty = auto-detect.
109    #[serde(default)]
110    pub language: String,
111}
112
/// Fallback value for `SttConfig::model`.
fn default_stt_model() -> String {
    String::from("sherpa-onnx-streaming-zipformer-en-20M")
}
116
117impl Default for SttConfig {
118    fn default() -> Self {
119        Self {
120            enabled: true,
121            model: default_stt_model(),
122            language: String::new(),
123        }
124    }
125}
126
127/// Text-to-speech configuration.
128#[derive(Debug, Clone, Serialize, Deserialize)]
129pub struct TtsConfig {
130    /// Enable TTS.
131    #[serde(default = "super::default_true")]
132    pub enabled: bool,
133
134    /// TTS provider: "browser" (Web Speech API), "openai", or "elevenlabs".
135    #[serde(default = "default_tts_provider")]
136    pub provider: String,
137
138    /// TTS model name or path.
139    #[serde(default = "default_tts_model")]
140    pub model: String,
141
142    /// TTS voice ID.
143    #[serde(default)]
144    pub voice: String,
145
146    /// Speaking speed multiplier (1.0 = normal).
147    #[serde(default = "default_speed")]
148    pub speed: f32,
149}
150
/// Fallback value for `TtsConfig::provider`.
fn default_tts_provider() -> String {
    String::from("browser")
}
/// Fallback value for `TtsConfig::model`.
fn default_tts_model() -> String {
    String::from("vits-piper-en_US-amy-medium")
}
/// Fallback value for `TtsConfig::speed` (1.0 = normal speed).
fn default_speed() -> f32 {
    1.0_f32
}
160
161impl Default for TtsConfig {
162    fn default() -> Self {
163        Self {
164            enabled: true,
165            provider: default_tts_provider(),
166            model: default_tts_model(),
167            voice: String::new(),
168            speed: default_speed(),
169        }
170    }
171}
172
173/// Voice activity detection configuration.
174#[derive(Debug, Clone, Serialize, Deserialize)]
175pub struct VadConfig {
176    /// VAD activation threshold (0.0-1.0).
177    #[serde(default = "default_vad_threshold")]
178    pub threshold: f32,
179
180    /// Silence duration in ms before speech end.
181    #[serde(default = "default_silence_timeout_ms", alias = "silenceTimeoutMs")]
182    pub silence_timeout_ms: u32,
183
184    /// Minimum speech duration in ms to trigger processing.
185    #[serde(default = "default_min_speech_ms", alias = "minSpeechMs")]
186    pub min_speech_ms: u32,
187}
188
/// Fallback value for `VadConfig::threshold`.
fn default_vad_threshold() -> f32 {
    0.5_f32
}
/// Fallback value for `VadConfig::silence_timeout_ms`, in milliseconds.
fn default_silence_timeout_ms() -> u32 {
    1_500
}
/// Fallback value for `VadConfig::min_speech_ms`, in milliseconds.
fn default_min_speech_ms() -> u32 {
    const MIN_SPEECH_MS: u32 = 250;
    MIN_SPEECH_MS
}
198
199impl Default for VadConfig {
200    fn default() -> Self {
201        Self {
202            threshold: default_vad_threshold(),
203            silence_timeout_ms: default_silence_timeout_ms(),
204            min_speech_ms: default_min_speech_ms(),
205        }
206    }
207}
208
209/// Wake word detection configuration.
210#[derive(Debug, Clone, Serialize, Deserialize)]
211pub struct WakeConfig {
212    /// Enable wake word detection.
213    #[serde(default)]
214    pub enabled: bool,
215
216    /// Wake word phrase (e.g. "hey weft").
217    #[serde(default = "default_wake_phrase")]
218    pub phrase: String,
219
220    /// Detection sensitivity (0.0-1.0).
221    #[serde(default = "default_wake_sensitivity")]
222    pub sensitivity: f32,
223
224    /// Custom wake word model path.
225    #[serde(default, alias = "modelPath")]
226    pub model_path: Option<String>,
227}
228
/// Fallback value for `WakeConfig::phrase`.
fn default_wake_phrase() -> String {
    String::from("hey weft")
}
/// Fallback value for `WakeConfig::sensitivity`.
fn default_wake_sensitivity() -> f32 {
    0.5_f32
}
235
236impl Default for WakeConfig {
237    fn default() -> Self {
238        Self {
239            enabled: false,
240            phrase: default_wake_phrase(),
241            sensitivity: default_wake_sensitivity(),
242            model_path: None,
243        }
244    }
245}
246
247/// Cloud STT/TTS fallback configuration.
248#[derive(Debug, Clone, Default, Serialize, Deserialize)]
249pub struct CloudFallbackConfig {
250    /// Enable cloud fallback when local models fail.
251    #[serde(default)]
252    pub enabled: bool,
253
254    /// Cloud STT provider ("whisper" for OpenAI Whisper API).
255    #[serde(default, alias = "sttProvider")]
256    pub stt_provider: String,
257
258    /// Cloud TTS provider ("elevenlabs" or "openai").
259    #[serde(default, alias = "ttsProvider")]
260    pub tts_provider: String,
261}