gemini_live/audio.rs
1//! Audio encoding utilities for the Gemini Live API.
2//!
3//! # Wire format
4//!
5//! | Direction | Encoding | Sample rate | MIME |
6//! |-----------|-------------------|-------------|---------------------------|
7//! | Input | 16-bit signed LE PCM | 16 kHz | `audio/pcm;rate=16000` |
8//! | Output | 16-bit signed LE PCM | 24 kHz | `audio/pcm;rate=24000` |
9//!
10//! Audio data is base64-encoded inside JSON text frames.
11//!
12//! # Chunk sizing
13//!
14//! Aim for **100–250 ms** per chunk:
15//! - 16 kHz × 2 bytes × 0.1 s = 3,200 bytes raw → ~4,300 bytes base64
16//! - 16 kHz × 2 bytes × 0.25 s = 8,000 bytes raw → ~10,700 bytes base64
17//!
18//! Chunks smaller than ~20 ms waste bandwidth on WebSocket frame overhead;
19//! chunks larger than ~250 ms add perceptible latency.
20
21use base64::Engine;
22
/// Sample rate, in Hz, of audio sent to the API.
pub const INPUT_SAMPLE_RATE: u32 = 16_000;

/// Sample rate, in Hz, of audio received from the API.
pub const OUTPUT_SAMPLE_RATE: u32 = 24_000;

/// MIME type for audio sent to the API: 16-bit little-endian PCM at 16 kHz.
pub const INPUT_AUDIO_MIME: &str = "audio/pcm;rate=16000";

/// MIME type for audio received from the API: 16-bit little-endian PCM at 24 kHz.
pub const OUTPUT_AUDIO_MIME: &str = "audio/pcm;rate=24000";
34
35/// Zero-allocation audio encoder for the streaming hot path.
36///
37/// Both [`encode_f32`](Self::encode_f32) and [`encode_i16_le`](Self::encode_i16_le)
38/// write into pre-allocated internal buffers and return a borrowed `&str`.
39/// After the first call that establishes capacity, subsequent calls produce
40/// **no heap allocations** — critical for real-time audio where this runs
41/// every 100–250 ms.
42///
43/// # Performance
44///
45/// For maximum throughput, keep a single `AudioEncoder` instance alive across
46/// the streaming loop and pair it with [`Session::send_raw`](crate::session::Session::send_raw)
47/// to avoid the extra allocation in [`Session::send_audio`](crate::session::Session::send_audio).
48///
49/// ```rust,no_run
50/// # use gemini_live::audio::{AudioEncoder, INPUT_AUDIO_MIME};
51/// # use gemini_live::types::*;
52/// # fn example(session: &gemini_live::session::Session, pcm_bytes: &[u8]) {
53/// let mut enc = AudioEncoder::new();
54/// // In a streaming loop:
55/// let b64 = enc.encode_i16_le(pcm_bytes);
56/// // Build the message — only the to_owned() here allocates:
57/// let msg = ClientMessage::RealtimeInput(RealtimeInput {
58/// audio: Some(Blob { data: b64.to_owned(), mime_type: INPUT_AUDIO_MIME.into() }),
59/// video: None, text: None, activity_start: None, activity_end: None,
60/// audio_stream_end: None,
61/// });
62/// # }
63/// ```
64///
65/// # Example
66///
67/// ```
68/// use gemini_live::audio::AudioEncoder;
69///
70/// let mut enc = AudioEncoder::new();
71/// let samples: Vec<f32> = vec![0.0, 0.5, -0.5, 1.0, -1.0];
72/// let base64 = enc.encode_f32(&samples);
73/// assert!(!base64.is_empty());
74/// ```
#[derive(Debug, Clone)]
pub struct AudioEncoder {
    /// Scratch buffer for the i16-LE PCM bytes produced from `f32` samples.
    pcm_buf: Vec<u8>,
    /// Base64 text of the most recent `encode_*` call; returned `&str`s borrow it.
    b64_buf: String,
}
79
80impl AudioEncoder {
81 /// Create a new encoder pre-allocated for ~250 ms at 16 kHz.
82 pub fn new() -> Self {
83 Self {
84 // 250 ms × 16 kHz × 2 bytes = 8,000 bytes
85 pcm_buf: Vec::with_capacity(8_000),
86 // base64 expands by ~4/3
87 b64_buf: String::with_capacity(11_000),
88 }
89 }
90
91 /// Encode `f32` samples (range `[-1.0, 1.0]`) to base64 i16-LE PCM.
92 ///
93 /// Values outside `[-1.0, 1.0]` are clamped. The returned `&str` borrows
94 /// the encoder's internal buffer and is valid until the next `encode_*`
95 /// call.
96 pub fn encode_f32(&mut self, samples: &[f32]) -> &str {
97 self.pcm_buf.clear();
98 for &s in samples {
99 let clamped = s.clamp(-1.0, 1.0);
100 let i16_val = (clamped * 32767.0) as i16;
101 self.pcm_buf.extend_from_slice(&i16_val.to_le_bytes());
102 }
103 self.b64_buf.clear();
104 base64::engine::general_purpose::STANDARD.encode_string(&self.pcm_buf, &mut self.b64_buf);
105 &self.b64_buf
106 }
107
108 /// Encode raw i16 little-endian PCM bytes to base64 (zero-conversion path).
109 ///
110 /// Use this when the audio source already provides i16-LE samples.
111 pub fn encode_i16_le(&mut self, pcm: &[u8]) -> &str {
112 self.b64_buf.clear();
113 base64::engine::general_purpose::STANDARD.encode_string(pcm, &mut self.b64_buf);
114 &self.b64_buf
115 }
116}
117
118impl Default for AudioEncoder {
119 fn default() -> Self {
120 Self::new()
121 }
122}
123
#[cfg(test)]
mod tests {
    use super::*;

    /// Decode standard base64 text back into raw bytes.
    fn decode(b64: &str) -> Vec<u8> {
        base64::engine::general_purpose::STANDARD
            .decode(b64)
            .expect("encoder must emit valid standard base64")
    }

    /// Decode base64 text and reinterpret the bytes as i16-LE samples.
    fn decode_samples(b64: &str) -> Vec<i16> {
        decode(b64)
            .chunks_exact(2)
            .map(|pair| i16::from_le_bytes([pair[0], pair[1]]))
            .collect()
    }

    #[test]
    fn encode_f32_silence() {
        let mut enc = AudioEncoder::new();
        let decoded = decode(enc.encode_f32(&[0.0f32; 100]));
        // 100 samples × 2 bytes each
        assert_eq!(decoded.len(), 200);
        // All zeros → silence
        assert!(decoded.iter().all(|&b| b == 0));
    }

    #[test]
    fn encode_f32_boundary_values() {
        let mut enc = AudioEncoder::new();
        let samples = decode_samples(enc.encode_f32(&[1.0, -1.0, 0.5, -0.5]));
        // The scale is symmetric (×32767), so -1.0 maps to -32767 rather than
        // i16::MIN, and ±0.5 (±16383.5) truncates toward zero.
        assert_eq!(samples, [32767, -32767, 16383, -16383]);
    }

    #[test]
    fn encode_f32_clamps_overflow() {
        let mut enc = AudioEncoder::new();
        let samples = decode_samples(enc.encode_f32(&[2.0, -2.0]));
        assert_eq!(samples, [32767, -32767]);
    }

    #[test]
    fn encode_f32_non_finite_values() {
        let mut enc = AudioEncoder::new();
        let input = [f32::NAN, f32::INFINITY, f32::NEG_INFINITY];
        let samples = decode_samples(enc.encode_f32(&input));
        // NaN becomes silence; infinities clamp to full scale.
        assert_eq!(samples, [0, 32767, -32767]);
    }

    #[test]
    fn encode_empty_input() {
        let mut enc = AudioEncoder::new();
        assert_eq!(enc.encode_f32(&[]), "");
        assert_eq!(enc.encode_i16_le(&[]), "");
    }

    #[test]
    fn encode_i16_le_roundtrip() {
        let mut enc = AudioEncoder::new();
        let raw_pcm: Vec<u8> = vec![0x01, 0x02, 0x03, 0x04];
        let b64 = enc.encode_i16_le(&raw_pcm);
        // 4 bytes is not a multiple of 3, so this also exercises padding.
        assert_eq!(b64, "AQIDBA==");
        assert_eq!(decode(b64), raw_pcm);
    }

    #[test]
    fn encode_discards_previous_output() {
        let mut enc = AudioEncoder::new();
        let _ = enc.encode_f32(&[0.25; 500]);
        // A shorter follow-up call must not leak bytes from the longer one.
        let decoded = decode(enc.encode_i16_le(&[0xAA, 0xBB]));
        assert_eq!(decoded, [0xAA, 0xBB]);
    }

    #[test]
    fn encoder_reuses_buffer() {
        let mut enc = AudioEncoder::new();
        let _ = enc.encode_f32(&[0.0; 1000]);
        let b64_before = (enc.b64_buf.as_ptr(), enc.b64_buf.capacity());
        let pcm_before = (enc.pcm_buf.as_ptr(), enc.pcm_buf.capacity());

        let _ = enc.encode_f32(&[0.0; 100]);
        // Same allocation and same capacity: the second call neither
        // reallocated nor replaced either buffer.
        assert_eq!((enc.b64_buf.as_ptr(), enc.b64_buf.capacity()), b64_before);
        assert_eq!((enc.pcm_buf.as_ptr(), enc.pcm_buf.capacity()), pcm_before);
    }
}