use serde::{Deserialize, Serialize};
/// Sample rate, in frames per second, used by the `AudioConfig::speech` preset.
pub(crate) const SAMPLE_RATE_SPEECH: u32 = 16_000;
/// Sample rate, in frames per second, used by the `AudioConfig::cd_quality` preset.
pub(crate) const SAMPLE_RATE_CD: u32 = 44_100;
/// Sample rate, in frames per second, used by the `AudioConfig::high_quality` preset.
pub(crate) const SAMPLE_RATE_HIGH_QUALITY: u32 = 48_000;
/// Encoding of a single audio sample.
///
/// The variant determines how many bytes one sample occupies; see
/// `AudioConfig::bytes_per_sample`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SampleFormat {
/// Two bytes per sample (presumably signed 16-bit integers; byte order is
/// not specified here).
I16,
/// Four bytes per sample (presumably 32-bit floats; byte order is not
/// specified here).
F32,
}
/// Describes the layout of raw audio data: rate, channel count and sample encoding.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioConfig {
/// Frames per second; `AudioBuffer::duration_secs` divides the frame count
/// by this value to obtain seconds.
pub sample_rate: u32,
/// Number of samples (one per channel) that make up a single frame.
pub channels: u16,
/// Encoding of each individual sample.
pub sample_format: SampleFormat,
}
impl AudioConfig {
pub fn speech() -> Self {
Self {
sample_rate: SAMPLE_RATE_SPEECH,
channels: 1,
sample_format: SampleFormat::I16,
}
}
pub fn cd_quality() -> Self {
Self {
sample_rate: SAMPLE_RATE_CD,
channels: 2,
sample_format: SampleFormat::I16,
}
}
pub fn high_quality() -> Self {
Self {
sample_rate: SAMPLE_RATE_HIGH_QUALITY,
channels: 2,
sample_format: SampleFormat::F32,
}
}
pub fn bytes_per_sample(&self) -> usize {
match self.sample_format {
SampleFormat::I16 => 2,
SampleFormat::F32 => 4,
}
}
pub fn bytes_per_frame(&self) -> usize {
self.bytes_per_sample() * self.channels as usize
}
}
impl Default for AudioConfig {
fn default() -> Self {
Self::speech()
}
}
/// Raw audio bytes paired with the configuration that describes their layout.
#[derive(Debug, Clone)]
pub struct AudioBuffer {
/// Raw audio bytes; frame counts are derived by dividing the length by
/// `config.bytes_per_frame()`.
pub data: Vec<u8>,
/// Layout (rate, channels, sample format) used to interpret `data`.
pub config: AudioConfig,
}
impl AudioBuffer {
    /// Creates a buffer with no audio data that uses the given `config`.
    pub fn new(config: AudioConfig) -> Self {
        Self {
            data: Vec::new(),
            config,
        }
    }

    /// Wraps already-available bytes together with the `config` describing them.
    ///
    /// The bytes are stored as-is; no validation against `config` is performed.
    pub fn from_pcm(data: Vec<u8>, config: AudioConfig) -> Self {
        Self { data, config }
    }

    /// Length of the audio in seconds, counting whole frames only.
    ///
    /// Returns `0.0` for degenerate configurations — a zero sample rate or a
    /// zero frame size — instead of producing an infinite or NaN duration.
    pub fn duration_secs(&self) -> f64 {
        let sample_rate = self.config.sample_rate;
        if sample_rate == 0 {
            // Dividing by a zero rate would yield `inf` (or `NaN` for an empty
            // buffer); treat it like the zero-frame-size case.
            return 0.0;
        }
        self.num_frames() as f64 / f64::from(sample_rate)
    }

    /// Number of complete frames held in `data`.
    ///
    /// Trailing bytes that do not fill a whole frame are ignored. Returns 0
    /// when the configured frame size is 0.
    pub fn num_frames(&self) -> usize {
        let frame_size = self.config.bytes_per_frame();
        // `checked_div` is `None` only for a zero divisor.
        self.data.len().checked_div(frame_size).unwrap_or(0)
    }

    /// Returns `true` when the buffer holds no bytes at all.
    ///
    /// A buffer containing only a partial frame is *not* empty, even though
    /// `num_frames` reports 0 for it.
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }
}
/// Identifies a voice, with optional descriptive metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Voice {
/// Identifier of the voice (e.g. `"alloy"`, the id used by `TtsOptions::default`).
pub id: String,
/// Optional name; left as `None` by `Voice::new`.
pub name: Option<String>,
/// Optional language; left as `None` by `Voice::new`. The expected format
/// (e.g. a language tag) is not defined here.
pub language: Option<String>,
}
impl Voice {
pub fn new(id: impl Into<String>) -> Self {
Self {
id: id.into(),
name: None,
language: None,
}
}
}
/// Output format that can be selected through `TtsOptions::output_format`.
///
/// Derives `PartialEq`/`Eq` (like `SampleFormat`) so formats can be compared
/// directly with `==` and `assert_eq!`.
///
/// Variant names presumably correspond to the audio container/codec of the
/// same name; how each one is produced is not defined in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum OutputFormat {
    Wav,
    Mp3,
    Pcm,
    Opus,
    Flac,
}
/// Options for a text-to-speech request.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TtsOptions {
/// Voice to use.
pub voice: Voice,
/// Optional speed setting; `None` by default. NOTE(review): the unit and
/// valid range (e.g. a playback-rate multiplier) are not defined here — confirm
/// against the consumer.
pub speed: Option<f32>,
/// Format requested for the output.
pub output_format: OutputFormat,
}
impl Default for TtsOptions {
fn default() -> Self {
Self {
voice: Voice::new("alloy"),
speed: None,
output_format: OutputFormat::Wav,
}
}
}
/// Options for a speech-to-text request.
///
/// The derived `Default` leaves both optional fields as `None` and
/// `timestamps` as `false`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SttOptions {
/// Optional language of the audio. NOTE(review): expected format (e.g. a
/// language code) is not defined here — confirm against the consumer.
pub language: Option<String>,
/// Whether timestamps are requested (presumably populating
/// `Transcript::segments`; verify against the consumer).
pub timestamps: bool,
/// Optional prompt text passed along with the request.
pub prompt: Option<String>,
}
/// Result of a transcription: the full text plus optional metadata and segments.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Transcript {
/// Full transcribed text.
pub text: String,
/// Language of the transcript, when available.
pub language: Option<String>,
/// Duration in seconds, when available.
pub duration_secs: Option<f64>,
/// Individual segments of the transcript; may be empty.
pub segments: Vec<TranscriptSegment>,
}
/// A portion of a transcript together with its start and end positions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptSegment {
/// Text of this segment.
pub text: String,
/// Start position of the segment (presumably seconds from the start of the
/// audio — TODO confirm unit).
pub start: f64,
/// End position of the segment (presumably seconds from the start of the
/// audio — TODO confirm unit).
pub end: f64,
}
#[cfg(test)]
mod tests {
    use super::*;

    // ---- AudioConfig presets -------------------------------------------

    #[test]
    fn speech_config_values() {
        let AudioConfig {
            sample_rate,
            channels,
            sample_format,
        } = AudioConfig::speech();
        assert_eq!(sample_rate, 16_000);
        assert_eq!(channels, 1);
        assert_eq!(sample_format, SampleFormat::I16);
    }

    #[test]
    fn cd_quality_config_values() {
        let AudioConfig {
            sample_rate,
            channels,
            sample_format,
        } = AudioConfig::cd_quality();
        assert_eq!(sample_rate, 44_100);
        assert_eq!(channels, 2);
        assert_eq!(sample_format, SampleFormat::I16);
    }

    #[test]
    fn high_quality_config_values() {
        let AudioConfig {
            sample_rate,
            channels,
            sample_format,
        } = AudioConfig::high_quality();
        assert_eq!(sample_rate, 48_000);
        assert_eq!(channels, 2);
        assert_eq!(sample_format, SampleFormat::F32);
    }

    // ---- Sample and frame sizes ----------------------------------------

    #[test]
    fn bytes_per_sample_i16() {
        assert_eq!(AudioConfig::speech().bytes_per_sample(), 2);
    }

    #[test]
    fn bytes_per_sample_f32() {
        assert_eq!(AudioConfig::high_quality().bytes_per_sample(), 4);
    }

    #[test]
    fn bytes_per_frame_mono_i16() {
        assert_eq!(AudioConfig::speech().bytes_per_frame(), 2);
    }

    #[test]
    fn bytes_per_frame_stereo_f32() {
        assert_eq!(AudioConfig::high_quality().bytes_per_frame(), 8);
    }

    // ---- AudioBuffer -----------------------------------------------------

    #[test]
    fn audio_buffer_new_is_empty() {
        let buffer = AudioBuffer::new(AudioConfig::speech());
        assert!(buffer.is_empty());
        assert_eq!(buffer.num_frames(), 0);
    }

    #[test]
    fn audio_buffer_from_pcm_stores_data() {
        let bytes = vec![0u8; 64];
        let buffer = AudioBuffer::from_pcm(bytes.clone(), AudioConfig::speech());
        assert_eq!(buffer.data, bytes);
        assert_eq!(buffer.config.sample_rate, 16_000);
        assert!(!buffer.is_empty());
    }

    #[test]
    fn audio_buffer_num_frames() {
        // 100 bytes of mono I16 audio -> 2 bytes per frame.
        let buffer = AudioBuffer::from_pcm(vec![0u8; 100], AudioConfig::speech());
        assert_eq!(buffer.num_frames(), 50);
    }

    #[test]
    fn audio_buffer_duration_secs() {
        // 32 000 bytes at 2 bytes per frame and 16 000 frames per second.
        let buffer = AudioBuffer::from_pcm(vec![0u8; 32_000], AudioConfig::speech());
        let deviation = (buffer.duration_secs() - 1.0).abs();
        assert!(deviation < f64::EPSILON);
    }

    // ---- Voice and OutputFormat -----------------------------------------

    #[test]
    fn voice_new_sets_id_and_defaults() {
        let voice = Voice::new("shimmer");
        assert_eq!(voice.id, "shimmer");
        assert!(voice.name.is_none());
        assert!(voice.language.is_none());
    }

    #[test]
    fn output_format_debug_is_reasonable() {
        let expectations = [(OutputFormat::Wav, "Wav"), (OutputFormat::Mp3, "Mp3")];
        for (format, expected) in expectations {
            assert_eq!(format!("{:?}", format), expected);
        }
    }
}