Skip to main content

gemini_live/
audio.rs

1//! Audio encoding utilities for the Gemini Live API.
2//!
3//! # Wire format
4//!
5//! | Direction | Encoding          | Sample rate | MIME                      |
6//! |-----------|-------------------|-------------|---------------------------|
7//! | Input     | 16-bit signed LE PCM | 16 kHz   | `audio/pcm;rate=16000`    |
8//! | Output    | 16-bit signed LE PCM | 24 kHz   | `audio/pcm;rate=24000`    |
9//!
10//! Audio data is base64-encoded inside JSON text frames.
11//!
12//! # Chunk sizing
13//!
14//! Aim for **100–250 ms** per chunk:
15//! - 16 kHz × 2 bytes × 0.1 s = 3,200 bytes raw → ~4,300 bytes base64
16//! - 16 kHz × 2 bytes × 0.25 s = 8,000 bytes raw → ~10,700 bytes base64
17//!
18//! Chunks smaller than ~20 ms waste bandwidth on WebSocket frame overhead;
19//! chunks larger than ~250 ms add perceptible latency.
20
21use base64::Engine;
22
/// Sample rate, in Hz, of audio sent to the API.
pub const INPUT_SAMPLE_RATE: u32 = 16_000;

/// MIME type of audio sent to the API (16-bit LE PCM, 16 kHz).
pub const INPUT_AUDIO_MIME: &str = "audio/pcm;rate=16000";

/// Sample rate, in Hz, of audio received from the API.
pub const OUTPUT_SAMPLE_RATE: u32 = 24_000;

/// MIME type of audio received from the API (16-bit LE PCM, 24 kHz).
pub const OUTPUT_AUDIO_MIME: &str = "audio/pcm;rate=24000";
34
/// Zero-allocation audio encoder for the streaming hot path.
///
/// Both [`encode_f32`](Self::encode_f32) and [`encode_i16_le`](Self::encode_i16_le)
/// write into pre-allocated internal buffers and return a borrowed `&str`.
/// After the first call that establishes capacity, subsequent calls produce
/// **no heap allocations** — critical for real-time audio where this runs
/// every 100–250 ms.
///
/// The buffers only ever grow: a chunk larger than any seen before enlarges
/// them, and that capacity is kept for later calls.
///
/// # Performance
///
/// For maximum throughput, keep a single `AudioEncoder` instance alive across
/// the streaming loop and pair it with [`Session::send_raw`](crate::session::Session::send_raw)
/// to avoid the extra allocation in [`Session::send_audio`](crate::session::Session::send_audio).
///
/// ```rust,no_run
/// # use gemini_live::audio::{AudioEncoder, INPUT_AUDIO_MIME};
/// # use gemini_live::types::*;
/// # fn example(session: &gemini_live::session::Session, pcm_bytes: &[u8]) {
/// let mut enc = AudioEncoder::new();
/// // In a streaming loop:
/// let b64 = enc.encode_i16_le(pcm_bytes);
/// // Build the message — only the to_owned() here allocates:
/// let msg = ClientMessage::RealtimeInput(RealtimeInput {
///     audio: Some(Blob { data: b64.to_owned(), mime_type: INPUT_AUDIO_MIME.into() }),
///     video: None, text: None, activity_start: None, activity_end: None,
///     audio_stream_end: None,
/// });
/// # }
/// ```
///
/// # Examples
///
/// ```
/// use gemini_live::audio::AudioEncoder;
///
/// let mut enc = AudioEncoder::new();
/// let samples: Vec<f32> = vec![0.0, 0.5, -0.5, 1.0, -1.0];
/// let base64 = enc.encode_f32(&samples);
/// assert!(!base64.is_empty());
/// ```
#[derive(Debug, Clone)]
pub struct AudioEncoder {
    /// Scratch buffer holding the i16-LE bytes produced from `f32` samples.
    pcm_buf: Vec<u8>,
    /// Base64 text of the most recently encoded chunk; handed out by reference.
    b64_buf: String,
}
79
80impl AudioEncoder {
81    /// Create a new encoder pre-allocated for ~250 ms at 16 kHz.
82    pub fn new() -> Self {
83        Self {
84            // 250 ms × 16 kHz × 2 bytes = 8,000 bytes
85            pcm_buf: Vec::with_capacity(8_000),
86            // base64 expands by ~4/3
87            b64_buf: String::with_capacity(11_000),
88        }
89    }
90
91    /// Encode `f32` samples (range `[-1.0, 1.0]`) to base64 i16-LE PCM.
92    ///
93    /// Values outside `[-1.0, 1.0]` are clamped.  The returned `&str` borrows
94    /// the encoder's internal buffer and is valid until the next `encode_*`
95    /// call.
96    pub fn encode_f32(&mut self, samples: &[f32]) -> &str {
97        self.pcm_buf.clear();
98        for &s in samples {
99            let clamped = s.clamp(-1.0, 1.0);
100            let i16_val = (clamped * 32767.0) as i16;
101            self.pcm_buf.extend_from_slice(&i16_val.to_le_bytes());
102        }
103        self.b64_buf.clear();
104        base64::engine::general_purpose::STANDARD.encode_string(&self.pcm_buf, &mut self.b64_buf);
105        &self.b64_buf
106    }
107
108    /// Encode raw i16 little-endian PCM bytes to base64 (zero-conversion path).
109    ///
110    /// Use this when the audio source already provides i16-LE samples.
111    pub fn encode_i16_le(&mut self, pcm: &[u8]) -> &str {
112        self.b64_buf.clear();
113        base64::engine::general_purpose::STANDARD.encode_string(pcm, &mut self.b64_buf);
114        &self.b64_buf
115    }
116}
117
118impl Default for AudioEncoder {
119    fn default() -> Self {
120        Self::new()
121    }
122}
123
#[cfg(test)]
mod tests {
    use super::*;

    /// Decode encoder output back into raw bytes, failing the test on invalid base64.
    fn decode_bytes(b64: &str) -> Vec<u8> {
        base64::engine::general_purpose::STANDARD
            .decode(b64)
            .expect("encoder output must be valid padded base64")
    }

    /// Decode encoder output and reinterpret it as i16 little-endian samples.
    fn decode_samples(b64: &str) -> Vec<i16> {
        let bytes = decode_bytes(b64);
        assert_eq!(bytes.len() % 2, 0, "PCM payload must hold whole i16 samples");
        bytes
            .chunks_exact(2)
            .map(|pair| i16::from_le_bytes([pair[0], pair[1]]))
            .collect()
    }

    #[test]
    fn encode_f32_silence() {
        let mut enc = AudioEncoder::new();
        let samples = vec![0.0f32; 100];
        let decoded = decode_bytes(enc.encode_f32(&samples));
        // 100 samples × 2 bytes each
        assert_eq!(decoded.len(), 200);
        // All zeros → silence
        assert!(decoded.iter().all(|&b| b == 0));
    }

    #[test]
    fn encode_f32_boundary_values() {
        let mut enc = AudioEncoder::new();
        let decoded = decode_samples(enc.encode_f32(&[1.0, -1.0, 0.5, -0.5]));
        // Scaling is symmetric (×32767), so -1.0 maps to -32767 rather than i16::MIN,
        // and ±0.5 (±16383.5 after scaling) truncates toward zero.
        assert_eq!(decoded, [32767, -32767, 16383, -16383]);
    }

    #[test]
    fn encode_f32_clamps_overflow() {
        let mut enc = AudioEncoder::new();
        let samples = [2.0f32, -2.0, f32::INFINITY, f32::NEG_INFINITY];
        let decoded = decode_samples(enc.encode_f32(&samples));
        assert_eq!(decoded, [32767, -32767, 32767, -32767]);
    }

    #[test]
    fn encode_f32_nan_becomes_silence() {
        let mut enc = AudioEncoder::new();
        let decoded = decode_samples(enc.encode_f32(&[f32::NAN, 0.25]));
        // NaN survives the clamp and is mapped to 0 by the float → int cast.
        assert_eq!(decoded, [0, 8191]);
    }

    #[test]
    fn empty_input_yields_empty_output() {
        let mut enc = AudioEncoder::new();
        assert_eq!(enc.encode_f32(&[]), "");
        assert_eq!(enc.encode_i16_le(&[]), "");
    }

    #[test]
    fn encode_i16_le_roundtrip() {
        let mut enc = AudioEncoder::new();
        let raw_pcm: Vec<u8> = vec![0x01, 0x02, 0x03, 0x04];
        let decoded = decode_bytes(enc.encode_i16_le(&raw_pcm));
        assert_eq!(decoded, raw_pcm);
    }

    #[test]
    fn encoder_reuses_buffer() {
        let mut enc = AudioEncoder::new();
        let _ = enc.encode_f32(&[0.0; 1000]);
        let cap_after_first = enc.b64_buf.capacity();
        let _ = enc.encode_f32(&[0.0; 100]);
        // Capacity should not shrink (buffer reuse)
        assert!(enc.b64_buf.capacity() >= cap_after_first);
    }

    #[test]
    fn encoder_discards_previous_output() {
        let mut enc = AudioEncoder::new();
        let _ = enc.encode_f32(&[1.0; 4]);
        // A shorter second chunk must not carry over bytes from the first.
        let decoded = decode_samples(enc.encode_f32(&[0.0; 2]));
        assert_eq!(decoded, [0, 0]);

        let _ = enc.encode_i16_le(&[0xFF; 8]);
        let decoded = decode_bytes(enc.encode_i16_le(&[0x01, 0x02]));
        assert_eq!(decoded, [0x01, 0x02]);
    }
}