//! `brainwires_audio/types.rs` — audio configuration, PCM buffer, text-to-speech and
//! speech-to-text data types.

use serde::{Deserialize, Serialize};

/// Sample rate in Hz used by the speech preset (16 kHz).
pub(crate) const SAMPLE_RATE_SPEECH: u32 = 16_000;
/// Sample rate in Hz used by the CD-quality preset (44.1 kHz).
pub(crate) const SAMPLE_RATE_CD: u32 = 44_100;
/// Sample rate in Hz used by the high-quality preset (48 kHz).
pub(crate) const SAMPLE_RATE_HIGH_QUALITY: u32 = 48_000;

7/// Supported audio sample formats.
8#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
9pub enum SampleFormat {
10    /// 16-bit signed integer PCM (most common for speech).
11    I16,
12    /// 32-bit floating point PCM.
13    F32,
14}
15
16/// Audio stream configuration.
17#[derive(Debug, Clone, Serialize, Deserialize)]
18pub struct AudioConfig {
19    /// Sample rate in Hz (e.g., 16000, 44100, 48000).
20    pub sample_rate: u32,
21    /// Number of audio channels (1 = mono, 2 = stereo).
22    pub channels: u16,
23    /// Sample format.
24    pub sample_format: SampleFormat,
25}
26
27impl AudioConfig {
28    /// Standard speech config: 16kHz mono 16-bit (Whisper, most STT APIs).
29    pub fn speech() -> Self {
30        Self {
31            sample_rate: SAMPLE_RATE_SPEECH,
32            channels: 1,
33            sample_format: SampleFormat::I16,
34        }
35    }
36
37    /// CD quality: 44.1kHz stereo 16-bit.
38    pub fn cd_quality() -> Self {
39        Self {
40            sample_rate: SAMPLE_RATE_CD,
41            channels: 2,
42            sample_format: SampleFormat::I16,
43        }
44    }
45
46    /// High quality: 48kHz stereo float.
47    pub fn high_quality() -> Self {
48        Self {
49            sample_rate: SAMPLE_RATE_HIGH_QUALITY,
50            channels: 2,
51            sample_format: SampleFormat::F32,
52        }
53    }
54
55    /// Bytes per sample for this format.
56    pub fn bytes_per_sample(&self) -> usize {
57        match self.sample_format {
58            SampleFormat::I16 => 2,
59            SampleFormat::F32 => 4,
60        }
61    }
62
63    /// Bytes per frame (one sample per channel).
64    pub fn bytes_per_frame(&self) -> usize {
65        self.bytes_per_sample() * self.channels as usize
66    }
67}
68
69impl Default for AudioConfig {
70    fn default() -> Self {
71        Self::speech()
72    }
73}
74
75/// A chunk of raw audio data with its format metadata.
76#[derive(Debug, Clone)]
77pub struct AudioBuffer {
78    /// Raw PCM sample data (little-endian).
79    pub data: Vec<u8>,
80    /// Audio configuration describing the format of `data`.
81    pub config: AudioConfig,
82}
83
84impl AudioBuffer {
85    /// Create a new empty buffer with the given config.
86    pub fn new(config: AudioConfig) -> Self {
87        Self {
88            data: Vec::new(),
89            config,
90        }
91    }
92
93    /// Create a buffer from raw PCM bytes.
94    pub fn from_pcm(data: Vec<u8>, config: AudioConfig) -> Self {
95        Self { data, config }
96    }
97
98    /// Duration of the audio in seconds.
99    pub fn duration_secs(&self) -> f64 {
100        let frame_size = self.config.bytes_per_frame();
101        if frame_size == 0 {
102            return 0.0;
103        }
104        let num_frames = self.data.len() / frame_size;
105        num_frames as f64 / self.config.sample_rate as f64
106    }
107
108    /// Number of frames in this buffer.
109    pub fn num_frames(&self) -> usize {
110        let frame_size = self.config.bytes_per_frame();
111        if frame_size == 0 {
112            0
113        } else {
114            self.data.len() / frame_size
115        }
116    }
117
118    /// Whether this buffer contains no audio data.
119    pub fn is_empty(&self) -> bool {
120        self.data.is_empty()
121    }
122}
123
124/// Voice identifier for TTS.
125#[derive(Debug, Clone, Serialize, Deserialize)]
126pub struct Voice {
127    /// Provider-specific voice identifier (e.g., "alloy", "echo", "shimmer").
128    pub id: String,
129    /// Human-readable display name.
130    pub name: Option<String>,
131    /// Language code (e.g., "en-US").
132    pub language: Option<String>,
133}
134
135impl Voice {
136    /// Create a new voice with the given identifier.
137    pub fn new(id: impl Into<String>) -> Self {
138        Self {
139            id: id.into(),
140            name: None,
141            language: None,
142        }
143    }
144}
145
146/// Output audio format for TTS.
147#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
148pub enum OutputFormat {
149    /// WAV format.
150    Wav,
151    /// MP3 format.
152    Mp3,
153    /// Raw PCM samples.
154    Pcm,
155    /// Opus compressed format.
156    Opus,
157    /// FLAC lossless compressed format.
158    Flac,
159}
160
161/// Options for text-to-speech generation.
162#[derive(Debug, Clone, Serialize, Deserialize)]
163pub struct TtsOptions {
164    /// Voice to use.
165    pub voice: Voice,
166    /// Playback speed multiplier (0.25 to 4.0, default 1.0).
167    pub speed: Option<f32>,
168    /// Output audio format.
169    pub output_format: OutputFormat,
170}
171
172impl Default for TtsOptions {
173    fn default() -> Self {
174        Self {
175            voice: Voice::new("alloy"),
176            speed: None,
177            output_format: OutputFormat::Wav,
178        }
179    }
180}
181
182/// Options for speech-to-text transcription.
183#[derive(Debug, Clone, Serialize, Deserialize, Default)]
184pub struct SttOptions {
185    /// Language hint (ISO-639-1 code, e.g., "en").
186    pub language: Option<String>,
187    /// Whether to include word-level timestamps.
188    pub timestamps: bool,
189    /// Optional prompt to guide the model.
190    pub prompt: Option<String>,
191}
192
193/// Result of a speech-to-text transcription.
194#[derive(Debug, Clone, Serialize, Deserialize)]
195pub struct Transcript {
196    /// The transcribed text.
197    pub text: String,
198    /// Language detected or used.
199    pub language: Option<String>,
200    /// Duration of the audio in seconds.
201    pub duration_secs: Option<f64>,
202    /// Word-level segments with timestamps (if requested).
203    pub segments: Vec<TranscriptSegment>,
204}
205
206/// A timed segment within a transcript.
207#[derive(Debug, Clone, Serialize, Deserialize)]
208pub struct TranscriptSegment {
209    /// Segment text.
210    pub text: String,
211    /// Start time in seconds.
212    pub start: f64,
213    /// End time in seconds.
214    pub end: f64,
215}
216
#[cfg(test)]
mod tests {
    use super::*;

    // --- AudioConfig presets -------------------------------------------------

    #[test]
    fn speech_config_values() {
        let cfg = AudioConfig::speech();
        assert_eq!(cfg.sample_rate, 16000);
        assert_eq!(cfg.channels, 1);
        assert_eq!(cfg.sample_format, SampleFormat::I16);
    }

    #[test]
    fn cd_quality_config_values() {
        let cfg = AudioConfig::cd_quality();
        assert_eq!(cfg.sample_rate, 44100);
        assert_eq!(cfg.channels, 2);
        assert_eq!(cfg.sample_format, SampleFormat::I16);
    }

    #[test]
    fn high_quality_config_values() {
        let cfg = AudioConfig::high_quality();
        assert_eq!(cfg.sample_rate, 48000);
        assert_eq!(cfg.channels, 2);
        assert_eq!(cfg.sample_format, SampleFormat::F32);
    }

    // --- AudioConfig size helpers --------------------------------------------

    #[test]
    fn bytes_per_sample_i16() {
        let cfg = AudioConfig::speech(); // I16
        assert_eq!(cfg.bytes_per_sample(), 2);
    }

    #[test]
    fn bytes_per_sample_f32() {
        let cfg = AudioConfig::high_quality(); // F32
        assert_eq!(cfg.bytes_per_sample(), 4);
    }

    #[test]
    fn bytes_per_frame_mono_i16() {
        let cfg = AudioConfig::speech(); // 1 channel, I16
        assert_eq!(cfg.bytes_per_frame(), 2); // 2 * 1
    }

    #[test]
    fn bytes_per_frame_stereo_f32() {
        let cfg = AudioConfig::high_quality(); // 2 channels, F32
        assert_eq!(cfg.bytes_per_frame(), 8); // 4 * 2
    }

    // --- AudioBuffer ---------------------------------------------------------

    #[test]
    fn audio_buffer_new_is_empty() {
        let buf = AudioBuffer::new(AudioConfig::speech());
        assert!(buf.is_empty());
        assert_eq!(buf.num_frames(), 0);
    }

    #[test]
    fn audio_buffer_from_pcm_stores_data() {
        let data = vec![0u8; 64];
        let cfg = AudioConfig::speech();
        let buf = AudioBuffer::from_pcm(data.clone(), cfg);
        assert_eq!(buf.data, data);
        assert_eq!(buf.config.sample_rate, 16000);
        assert!(!buf.is_empty());
    }

    #[test]
    fn audio_buffer_num_frames() {
        // 16kHz mono I16 => 2 bytes per frame
        // 100 bytes => 50 frames
        let buf = AudioBuffer::from_pcm(vec![0u8; 100], AudioConfig::speech());
        assert_eq!(buf.num_frames(), 50);
    }

    #[test]
    fn audio_buffer_duration_secs() {
        // 16kHz mono I16 => 2 bytes/frame
        // 32000 bytes => 16000 frames => 1.0 second
        let buf = AudioBuffer::from_pcm(vec![0u8; 32000], AudioConfig::speech());
        assert!((buf.duration_secs() - 1.0).abs() < f64::EPSILON);
    }

    // --- Voice / OutputFormat ------------------------------------------------

    #[test]
    fn voice_new_sets_id_and_defaults() {
        let v = Voice::new("shimmer");
        assert_eq!(v.id, "shimmer");
        assert!(v.name.is_none());
        assert!(v.language.is_none());
    }

    #[test]
    fn output_format_debug_is_reasonable() {
        let dbg = format!("{:?}", OutputFormat::Wav);
        assert_eq!(dbg, "Wav");
        let dbg_mp3 = format!("{:?}", OutputFormat::Mp3);
        assert_eq!(dbg_mp3, "Mp3");
    }
}