Skip to main content

atomr_agents_stt_core/
audio.rs

//! Audio input shape. The trait is decode-agnostic — `AudioInput`
//! carries either a path, a byte buffer (with a declared format),
//! or already-decoded PCM. `atomr-agents-stt-audio` provides the
//! actual symphonia-based decoder used by local backends.
5
6use std::path::PathBuf;
7
8use bytes::Bytes;
9use serde::{Deserialize, Serialize};
10
11#[derive(Debug, Clone)]
12pub enum AudioInput {
13    /// File on disk. Format is inferred from the extension by
14    /// `stt-audio::decode`, or supplied alongside via the cloud
15    /// backend's multipart filename header.
16    File(PathBuf),
17    /// In-memory buffer with an explicit format hint.
18    Bytes { data: Bytes, format: AudioFormat },
19    /// Already-decoded PCM samples. Backends that do their own
20    /// decode (whisper-rs) avoid the symphonia round-trip.
21    Pcm(PcmBuffer),
22}
23
24#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
25#[serde(tag = "kind", rename_all = "snake_case")]
26pub enum AudioFormat {
27    Pcm {
28        sample_rate: u32,
29        channels: u16,
30        sample: SampleType,
31    },
32    Wav,
33    Mp3,
34    Flac,
35    Ogg,
36    Opus,
37    Webm,
38    Mp4,
39    Aac,
40    /// 8 kHz µ-law (telephony).
41    Mulaw {
42        sample_rate: u32,
43    },
44}
45
46impl AudioFormat {
47    /// Conventional MIME type for HTTP uploads.
48    pub fn mime(&self) -> &'static str {
49        match self {
50            AudioFormat::Pcm { .. } => "audio/wav",
51            AudioFormat::Wav => "audio/wav",
52            AudioFormat::Mp3 => "audio/mpeg",
53            AudioFormat::Flac => "audio/flac",
54            AudioFormat::Ogg => "audio/ogg",
55            AudioFormat::Opus => "audio/opus",
56            AudioFormat::Webm => "audio/webm",
57            AudioFormat::Mp4 => "audio/mp4",
58            AudioFormat::Aac => "audio/aac",
59            AudioFormat::Mulaw { .. } => "audio/basic",
60        }
61    }
62
63    /// Conventional file extension (no leading dot).
64    pub fn extension(&self) -> &'static str {
65        match self {
66            AudioFormat::Pcm { .. } | AudioFormat::Wav => "wav",
67            AudioFormat::Mp3 => "mp3",
68            AudioFormat::Flac => "flac",
69            AudioFormat::Ogg => "ogg",
70            AudioFormat::Opus => "opus",
71            AudioFormat::Webm => "webm",
72            AudioFormat::Mp4 => "mp4",
73            AudioFormat::Aac => "aac",
74            AudioFormat::Mulaw { .. } => "raw",
75        }
76    }
77}
78
79#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
80#[serde(rename_all = "snake_case")]
81pub enum SampleType {
82    I16,
83    I32,
84    F32,
85}
86
/// Decoded PCM audio. Backends that need a different layout (e.g.
/// whisper-rs, which wants 16 kHz mono f32) take this and convert it.
#[derive(Debug, Clone)]
pub struct PcmBuffer {
    /// Sample values.
    // NOTE(review): `duration_secs` divides the length by
    // `sample_rate * channels`, so this presumably holds all channels
    // interleaved — confirm against the decoder.
    pub samples: Vec<f32>,
    /// Sample rate; `duration_secs` treats it as samples per second
    /// per channel.
    pub sample_rate: u32,
    /// Number of channels.
    pub channels: u16,
}

impl PcmBuffer {
    /// Bundles `samples` with their sample rate and channel count.
    /// No validation is performed on any of the arguments.
    pub fn new(samples: Vec<f32>, sample_rate: u32, channels: u16) -> Self {
        PcmBuffer {
            samples,
            sample_rate,
            channels,
        }
    }

    /// Length of the buffer in seconds, computed in `f32` as
    /// `samples.len() / (sample_rate * channels)`.
    ///
    /// Returns `0.0` when either `sample_rate` or `channels` is zero,
    /// so the division never sees a zero denominator.
    pub fn duration_secs(&self) -> f32 {
        if self.sample_rate == 0 || self.channels == 0 {
            return 0.0;
        }

        let sample_count = self.samples.len() as f32;
        let samples_per_second = self.sample_rate as f32 * self.channels as f32;
        sample_count / samples_per_second
    }
}