speech_prep/chunker/mod.rs
1//! Audio chunking aligned to VAD segments.
2//!
3//! This module segments standardized PCM audio into speech-aligned chunks with
4//! timing and overlap metadata.
5//!
6//! # Architecture
7//!
8//! The chunker follows a streaming-first design:
9//! 1. Accept VAD boundaries (`SpeechChunk`) + raw PCM samples
10//! 2. Generate fixed-duration chunks (default 500ms) aligned to speech
11//! boundaries
12//! 3. Attach temporal metadata (`AudioTimestamp`) for deterministic testing
13//! 4. Attach quality metrics such as energy and speech ratio
14//!
15//! # Performance Contracts
16//!
17//! - **Latency**: <15ms total processing per chunk
18//! - **Alignment**: ±20ms accuracy to VAD boundaries
19//! - **Coverage**: Chunks cover 100% of input duration (no gaps)
20//!
21//! # Example
22//!
23//! ```rust
24//! use speech_prep::{Chunker, ChunkerConfig, SpeechChunk};
25//! use speech_prep::time::{AudioDuration, AudioTimestamp};
26//!
27//! let config = ChunkerConfig::default(); // 500ms chunks
28//! let chunker = Chunker::new(config);
29//!
30//! let audio: Vec<f32> = vec![0.0; 16000]; // 1 second @ 16kHz
31//! let vad_segments = vec![SpeechChunk {
32//! start_time: AudioTimestamp::EPOCH,
33//! end_time: AudioTimestamp::EPOCH
34//! .add_duration(AudioDuration::from_secs(1)),
35//! confidence: 0.9,
36//! avg_energy: 0.5,
37//! frame_count: 50,
38//! }];
39//!
40//! let chunks = chunker.chunk(&audio, 16000, &vad_segments)?;
41//! assert_eq!(chunks.len(), 2); // Two 500ms chunks from 1s speech
42//!
43//! // Overlaps are automatically added between chunks
44//! assert!(chunks[0].overlap_next.is_some()); // First chunk has overlap for next
45//! assert!(chunks[1].overlap_prev.is_some()); // Second chunk has overlap from prev
46//! # Ok::<(), speech_prep::error::Error>(())
47//! ```
48
49use crate::error::{Error, Result};
50use crate::time::AudioTimestamp;
51use std::time::{Duration, Instant};
52
53use crate::SpeechChunk;
54
55mod analysis;
56mod config;
57mod overlap;
58mod planner;
59mod segments;
60mod types;
61
62pub use config::ChunkerConfig;
63use overlap::apply_overlaps;
64pub use types::{ChunkBoundary, ProcessedChunk};
65
66/// Audio chunker for segmenting streams into processing units.
67///
68/// Combines VAD boundaries with duration heuristics to produce processing-size
69/// chunks.
70#[derive(Debug, Clone, Copy)]
71pub struct Chunker {
72 config: ChunkerConfig,
73}
74
75#[allow(clippy::multiple_inherent_impl)]
76impl Chunker {
77 /// Create a new chunker with the given configuration.
78 #[must_use]
79 pub fn new(config: ChunkerConfig) -> Self {
80 Self { config }
81 }
82
83 /// Create a chunker with default configuration (500ms chunks).
84 ///
85 /// Alias for `Chunker::new(ChunkerConfig::default())`.
86 #[must_use]
87 #[allow(clippy::should_implement_trait)]
88 pub fn default() -> Self {
89 Self::new(ChunkerConfig::default())
90 }
91
92 /// Segment audio into processing chunks aligned to VAD boundaries.
93 ///
94 /// This variant assumes the VAD timestamps use the zero-based origin from
95 /// `AudioTimestamp::EPOCH`. If the timestamps already include a stream
96 /// offset, prefer [`Chunker::chunk_with_stream_start`].
97 ///
98 /// # Arguments
99 ///
100 /// - `audio`: Raw PCM samples (f32, normalized to [-1.0, 1.0])
101 /// - `sample_rate`: Audio sample rate in Hz (must be > 0)
102 /// - `vad_segments`: Speech boundaries from VAD analysis
103 ///
104 /// # Returns
105 ///
106 /// Vector of `ProcessedChunk` covering the entire input duration with no
107 /// gaps.
108 ///
109 /// # Errors
110 ///
111 /// Returns `Error::InvalidInput` if:
112 /// - `sample_rate` is zero
113 /// - `audio` is empty
114 /// - VAD segments have invalid timestamps (end < start)
115 ///
116 /// # Performance
117 ///
118 /// Target: <15ms total processing time per chunk generated.
119 pub fn chunk(
120 &self,
121 audio: &[f32],
122 sample_rate: u32,
123 vad_segments: &[SpeechChunk],
124 ) -> Result<Vec<ProcessedChunk>> {
125 self.chunk_with_stream_start(audio, sample_rate, vad_segments, AudioTimestamp::EPOCH)
126 }
127
128 /// Segment audio into processing chunks with an explicit stream start time.
129 ///
130 /// Use this variant when the VAD timestamps should be interpreted relative
131 /// to a known stream start rather than zero-based.
132 ///
133 /// ```
134 /// use speech_prep::{Chunker, ChunkerConfig, SpeechChunk};
135 /// use speech_prep::time::{AudioDuration, AudioTimestamp};
136 ///
137 /// # fn main() -> speech_prep::error::Result<()> {
138 /// let chunker = Chunker::new(ChunkerConfig::streaming());
139 /// let stream_start = AudioTimestamp::EPOCH;
140 ///
141 /// // VAD emits wall-clock timestamps relative to the live stream
142 /// let segments = vec![SpeechChunk {
143 /// start_time: stream_start,
144 /// end_time: stream_start.add_duration(AudioDuration::from_millis(240)),
145 /// confidence: 0.92,
146 /// avg_energy: 0.4,
147 /// frame_count: 48,
148 /// }];
149 ///
150 /// let audio = vec![0.0; 3840]; // 240ms @ 16kHz
151 /// let chunks = chunker.chunk_with_stream_start(&audio, 16_000, &segments, stream_start)?;
152 /// assert_eq!(chunks.len(), 1);
153 /// # Ok(())
154 /// # }
155 /// ```
156 pub fn chunk_with_stream_start(
157 &self,
158 audio: &[f32],
159 sample_rate: u32,
160 vad_segments: &[SpeechChunk],
161 stream_start_time: AudioTimestamp,
162 ) -> Result<Vec<ProcessedChunk>> {
163 if sample_rate == 0 {
164 return Err(Error::InvalidInput("sample_rate must be > 0".into()));
165 }
166 if audio.is_empty() {
167 return Err(Error::InvalidInput("audio buffer is empty".into()));
168 }
169
170 for segment in vad_segments {
171 if segment.end_time < segment.start_time {
172 return Err(Error::InvalidInput(
173 "VAD segment has end_time < start_time".into(),
174 ));
175 }
176 }
177
178 let processing_start = Instant::now();
179
180 let total_samples = audio.len();
181 let total_duration_secs = total_samples as f64 / f64::from(sample_rate);
182 let total_duration = Duration::from_secs_f64(total_duration_secs);
183
184 let earliest_segment_start = vad_segments.iter().map(|seg| seg.start_time).min();
185 let audio_start = earliest_segment_start.map_or(stream_start_time, |start| {
186 std::cmp::min(start, stream_start_time)
187 });
188
189 let noise_baseline =
190 Self::compute_noise_baseline(audio, sample_rate, vad_segments, audio_start);
191
192 let estimated_chunks =
193 (total_duration.as_millis() / self.config.target_duration.as_millis()).max(1) as usize
194 + 1;
195 let mut chunks = Vec::with_capacity(estimated_chunks);
196
197 if vad_segments.is_empty() {
198 chunks.push(Self::create_silence_chunk(
199 audio,
200 sample_rate,
201 audio_start,
202 total_duration,
203 audio_start,
204 )?);
205 } else {
206 let mut current_time = audio_start;
207
208 for segment in vad_segments {
209 if segment.start_time > current_time {
210 let silence_end = segment.start_time;
211 let silence_duration =
212 silence_end.duration_since(current_time).ok_or_else(|| {
213 Error::Processing("VAD segment start_time < current_time".into())
214 })?;
215
216 chunks.push(Self::create_silence_chunk(
217 audio,
218 sample_rate,
219 current_time,
220 silence_duration,
221 audio_start,
222 )?);
223 }
224
225 let segment_chunks = self.process_speech_segment(
226 audio,
227 sample_rate,
228 segment,
229 noise_baseline,
230 audio_start,
231 )?;
232 chunks.extend(segment_chunks);
233
234 current_time = segment.end_time;
235 }
236
237 let total_end_time = audio_start.add_duration(total_duration);
238 if total_end_time > current_time {
239 let trailing_duration = total_end_time
240 .duration_since(current_time)
241 .ok_or_else(|| Error::Processing("total_end_time < current_time".into()))?;
242 chunks.push(Self::create_silence_chunk(
243 audio,
244 sample_rate,
245 current_time,
246 trailing_duration,
247 audio_start,
248 )?);
249 }
250 }
251
252 let overlap_samples = Self::duration_to_samples(self.config.overlap_duration, sample_rate);
253 apply_overlaps(&mut chunks, overlap_samples, sample_rate);
254
255 let latency = processing_start.elapsed();
256 let chunk_count = chunks.len().max(1);
257 let _per_chunk = Duration::from_secs_f64(latency.as_secs_f64() / chunk_count as f64);
258 for _ in 0..chunk_count {}
259
260 Ok(chunks)
261 }
262}
263
264#[cfg(test)]
265mod tests;