1use serde::{Deserialize, Serialize};
2
/// Frames per second used by the speech preset ([`AudioConfig::speech`]).
pub(crate) const SAMPLE_RATE_SPEECH: u32 = 16_000;

/// Frames per second used by the CD-quality preset ([`AudioConfig::cd_quality`]).
pub(crate) const SAMPLE_RATE_CD: u32 = 44_100;

/// Frames per second used by the high-quality preset ([`AudioConfig::high_quality`]).
pub(crate) const SAMPLE_RATE_HIGH_QUALITY: u32 = 48_000;
6
7#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
9pub enum SampleFormat {
10 I16,
12 F32,
14}
15
16#[derive(Debug, Clone, Serialize, Deserialize)]
18pub struct AudioConfig {
19 pub sample_rate: u32,
21 pub channels: u16,
23 pub sample_format: SampleFormat,
25}
26
27impl AudioConfig {
28 pub fn speech() -> Self {
30 Self {
31 sample_rate: SAMPLE_RATE_SPEECH,
32 channels: 1,
33 sample_format: SampleFormat::I16,
34 }
35 }
36
37 pub fn cd_quality() -> Self {
39 Self {
40 sample_rate: SAMPLE_RATE_CD,
41 channels: 2,
42 sample_format: SampleFormat::I16,
43 }
44 }
45
46 pub fn high_quality() -> Self {
48 Self {
49 sample_rate: SAMPLE_RATE_HIGH_QUALITY,
50 channels: 2,
51 sample_format: SampleFormat::F32,
52 }
53 }
54
55 pub fn bytes_per_sample(&self) -> usize {
57 match self.sample_format {
58 SampleFormat::I16 => 2,
59 SampleFormat::F32 => 4,
60 }
61 }
62
63 pub fn bytes_per_frame(&self) -> usize {
65 self.bytes_per_sample() * self.channels as usize
66 }
67}
68
69impl Default for AudioConfig {
70 fn default() -> Self {
71 Self::speech()
72 }
73}
74
75#[derive(Debug, Clone)]
77pub struct AudioBuffer {
78 pub data: Vec<u8>,
80 pub config: AudioConfig,
82}
83
84impl AudioBuffer {
85 pub fn new(config: AudioConfig) -> Self {
87 Self {
88 data: Vec::new(),
89 config,
90 }
91 }
92
93 pub fn from_pcm(data: Vec<u8>, config: AudioConfig) -> Self {
95 Self { data, config }
96 }
97
98 pub fn duration_secs(&self) -> f64 {
100 let frame_size = self.config.bytes_per_frame();
101 if frame_size == 0 {
102 return 0.0;
103 }
104 let num_frames = self.data.len() / frame_size;
105 num_frames as f64 / self.config.sample_rate as f64
106 }
107
108 pub fn num_frames(&self) -> usize {
110 let frame_size = self.config.bytes_per_frame();
111 if frame_size == 0 {
112 0
113 } else {
114 self.data.len() / frame_size
115 }
116 }
117
118 pub fn is_empty(&self) -> bool {
120 self.data.is_empty()
121 }
122}
123
124#[derive(Debug, Clone, Serialize, Deserialize)]
126pub struct Voice {
127 pub id: String,
129 pub name: Option<String>,
131 pub language: Option<String>,
133}
134
135impl Voice {
136 pub fn new(id: impl Into<String>) -> Self {
138 Self {
139 id: id.into(),
140 name: None,
141 language: None,
142 }
143 }
144}
145
146#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
148pub enum OutputFormat {
149 Wav,
151 Mp3,
153 Pcm,
155 Opus,
157 Flac,
159}
160
161#[derive(Debug, Clone, Serialize, Deserialize)]
163pub struct TtsOptions {
164 pub voice: Voice,
166 pub speed: Option<f32>,
168 pub output_format: OutputFormat,
170}
171
172impl Default for TtsOptions {
173 fn default() -> Self {
174 Self {
175 voice: Voice::new("alloy"),
176 speed: None,
177 output_format: OutputFormat::Wav,
178 }
179 }
180}
181
182#[derive(Debug, Clone, Serialize, Deserialize, Default)]
184pub struct SttOptions {
185 pub language: Option<String>,
187 pub timestamps: bool,
189 pub prompt: Option<String>,
191}
192
193#[derive(Debug, Clone, Serialize, Deserialize)]
195pub struct Transcript {
196 pub text: String,
198 pub language: Option<String>,
200 pub duration_secs: Option<f64>,
202 pub segments: Vec<TranscriptSegment>,
204}
205
206#[derive(Debug, Clone, Serialize, Deserialize)]
208pub struct TranscriptSegment {
209 pub text: String,
211 pub start: f64,
213 pub end: f64,
215}
216
#[cfg(test)]
mod tests {
    use super::*;

    // ---- AudioConfig presets -------------------------------------------------

    #[test]
    fn speech_config_values() {
        let cfg = AudioConfig::speech();
        assert_eq!(cfg.sample_rate, 16000);
        assert_eq!(cfg.channels, 1);
        assert_eq!(cfg.sample_format, SampleFormat::I16);
    }

    #[test]
    fn cd_quality_config_values() {
        let cfg = AudioConfig::cd_quality();
        assert_eq!(cfg.sample_rate, 44100);
        assert_eq!(cfg.channels, 2);
        assert_eq!(cfg.sample_format, SampleFormat::I16);
    }

    #[test]
    fn high_quality_config_values() {
        let cfg = AudioConfig::high_quality();
        assert_eq!(cfg.sample_rate, 48000);
        assert_eq!(cfg.channels, 2);
        assert_eq!(cfg.sample_format, SampleFormat::F32);
    }

    #[test]
    fn default_config_is_speech() {
        let cfg = AudioConfig::default();
        let speech = AudioConfig::speech();
        assert_eq!(cfg.sample_rate, speech.sample_rate);
        assert_eq!(cfg.channels, speech.channels);
        assert_eq!(cfg.sample_format, speech.sample_format);
    }

    // ---- Byte-size helpers ---------------------------------------------------

    #[test]
    fn bytes_per_sample_i16() {
        let cfg = AudioConfig::speech();
        assert_eq!(cfg.bytes_per_sample(), 2);
    }

    #[test]
    fn bytes_per_sample_f32() {
        let cfg = AudioConfig::high_quality();
        assert_eq!(cfg.bytes_per_sample(), 4);
    }

    #[test]
    fn bytes_per_frame_mono_i16() {
        let cfg = AudioConfig::speech();
        assert_eq!(cfg.bytes_per_frame(), 2);
    }

    #[test]
    fn bytes_per_frame_stereo_f32() {
        let cfg = AudioConfig::high_quality();
        assert_eq!(cfg.bytes_per_frame(), 8);
    }

    // ---- AudioBuffer ---------------------------------------------------------

    #[test]
    fn audio_buffer_new_is_empty() {
        let buf = AudioBuffer::new(AudioConfig::speech());
        assert!(buf.is_empty());
        assert_eq!(buf.num_frames(), 0);
    }

    #[test]
    fn audio_buffer_from_pcm_stores_data() {
        let data = vec![0u8; 64];
        let cfg = AudioConfig::speech();
        let buf = AudioBuffer::from_pcm(data.clone(), cfg);
        assert_eq!(buf.data, data);
        assert_eq!(buf.config.sample_rate, 16000);
        assert!(!buf.is_empty());
    }

    #[test]
    fn audio_buffer_num_frames() {
        // 100 bytes of mono 16-bit audio is 50 two-byte frames.
        let buf = AudioBuffer::from_pcm(vec![0u8; 100], AudioConfig::speech());
        assert_eq!(buf.num_frames(), 50);
    }

    #[test]
    fn audio_buffer_ignores_trailing_partial_frame() {
        // One stray byte after 50 mono 16-bit frames.
        let mono = AudioBuffer::from_pcm(vec![0u8; 101], AudioConfig::speech());
        assert_eq!(mono.num_frames(), 50);

        // 15 bytes hold exactly one 8-byte stereo f32 frame.
        let stereo = AudioBuffer::from_pcm(vec![0u8; 15], AudioConfig::high_quality());
        assert_eq!(stereo.num_frames(), 1);
    }

    #[test]
    fn audio_buffer_duration_secs() {
        // 16 000 frames at 16 kHz is exactly one second.
        let buf = AudioBuffer::from_pcm(vec![0u8; 32000], AudioConfig::speech());
        assert!((buf.duration_secs() - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn audio_buffer_zero_channels_is_guarded() {
        // A zero-channel config has a zero frame size; both helpers must not divide by it.
        let cfg = AudioConfig {
            sample_rate: 16000,
            channels: 0,
            sample_format: SampleFormat::I16,
        };
        let buf = AudioBuffer::from_pcm(vec![0u8; 10], cfg);
        assert_eq!(buf.num_frames(), 0);
        assert_eq!(buf.duration_secs(), 0.0);
        assert!(!buf.is_empty());
    }

    // ---- Voice / options -----------------------------------------------------

    #[test]
    fn voice_new_sets_id_and_defaults() {
        let v = Voice::new("shimmer");
        assert_eq!(v.id, "shimmer");
        assert!(v.name.is_none());
        assert!(v.language.is_none());
    }

    #[test]
    fn output_format_debug_is_reasonable() {
        let dbg = format!("{:?}", OutputFormat::Wav);
        assert_eq!(dbg, "Wav");
        let dbg_mp3 = format!("{:?}", OutputFormat::Mp3);
        assert_eq!(dbg_mp3, "Mp3");
    }

    #[test]
    fn tts_options_default_values() {
        let opts = TtsOptions::default();
        assert_eq!(opts.voice.id, "alloy");
        assert!(opts.voice.name.is_none());
        assert!(opts.voice.language.is_none());
        assert!(opts.speed.is_none());
        assert!(matches!(opts.output_format, OutputFormat::Wav));
    }

    #[test]
    fn stt_options_default_values() {
        let opts = SttOptions::default();
        assert!(opts.language.is_none());
        assert!(!opts.timestamps);
        assert!(opts.prompt.is_none());
    }
}