speech_prep/chunker/mod.rs
1//! Audio chunking pipeline for multi-consumer processing.
2//!
3//! This module segments standardized PCM audio into speech-aligned chunks with
4//! rich metadata, enabling parallel downstream processing by speech
5//! recognition, scoring, and alignment pipelines.
6//!
7//! # Architecture
8//!
9//! The chunker follows a streaming-first design:
10//! 1. Accept VAD boundaries (`SpeechChunk`) + raw PCM samples
11//! 2. Generate fixed-duration chunks (default 500ms) aligned to speech
12//! boundaries
13//! 3. Attach temporal metadata (`AudioTimestamp`) for deterministic testing
14//! 4. Provide quality metrics (energy, speech ratio) for adaptive routing
15//!
16//! # Performance Contracts
17//!
18//! - **Latency**: <15ms total processing per chunk
19//! - **Alignment**: ±20ms accuracy to VAD boundaries
20//! - **Coverage**: Chunks cover 100% of input duration (no gaps)
21//!
22//! # Example
23//!
24//! ```rust
25//! use speech_prep::{Chunker, ChunkerConfig, SpeechChunk};
26//! use speech_prep::time::{AudioDuration, AudioTimestamp};
27//!
28//! let config = ChunkerConfig::default(); // 500ms chunks
29//! let chunker = Chunker::new(config);
30//!
31//! let audio: Vec<f32> = vec![0.0; 16000]; // 1 second @ 16kHz
32//! let vad_segments = vec![SpeechChunk {
33//! start_time: AudioTimestamp::EPOCH,
34//! end_time: AudioTimestamp::EPOCH
35//! .add_duration(AudioDuration::from_secs(1)),
36//! confidence: 0.9,
37//! avg_energy: 0.5,
38//! frame_count: 50,
39//! }];
40//!
41//! let chunks = chunker.chunk(&audio, 16000, &vad_segments)?;
42//! assert_eq!(chunks.len(), 2); // Two 500ms chunks from 1s speech
43//!
44//! // Overlaps are automatically added between chunks
45//! assert!(chunks[0].overlap_next.is_some()); // First chunk has overlap for next
46//! assert!(chunks[1].overlap_prev.is_some()); // Second chunk has overlap from prev
47//! # Ok::<(), speech_prep::error::Error>(())
48//! ```
49
50use crate::error::{Error, Result};
51use crate::time::AudioTimestamp;
52use std::time::{Duration, Instant};
53
54use crate::SpeechChunk;
55
56mod analysis;
57mod config;
58mod overlap;
59mod planner;
60mod segments;
61mod types;
62
63pub use config::ChunkerConfig;
64use overlap::apply_overlaps;
65pub use types::{ChunkBoundary, ProcessedChunk};
66
/// Audio chunker for segmenting streams into processing units.
///
/// Combines VAD boundaries with duration heuristics to produce chunks
/// optimized for downstream consumers (speech recognition, scoring, and
/// alignment pipelines).
///
/// The struct is `Copy` because it only holds a small configuration value;
/// cloning a `Chunker` is free and shares no state.
#[derive(Debug, Clone, Copy)]
pub struct Chunker {
    /// Chunking parameters (target chunk duration, overlap duration, …).
    config: ChunkerConfig,
}
75
76#[allow(clippy::multiple_inherent_impl)]
77impl Chunker {
78 /// Create a new chunker with the given configuration.
79 #[must_use]
80 pub fn new(config: ChunkerConfig) -> Self {
81 Self { config }
82 }
83
84 /// Create a chunker with default configuration (500ms chunks).
85 ///
86 /// Alias for `Chunker::new(ChunkerConfig::default())`.
87 #[must_use]
88 #[allow(clippy::should_implement_trait)]
89 pub fn default() -> Self {
90 Self::new(ChunkerConfig::default())
91 }
92
93 /// Segment audio into processing chunks aligned to VAD boundaries.
94 ///
95 /// This variant assumes that VAD timestamps are relative to the Unix epoch
96 /// (e.g., tests that build times off `AudioTimestamp::EPOCH`). For streaming
97 /// scenarios where VAD emits wall-clock timestamps (`AudioTimestamp::now()`),
98 /// prefer [`Chunker::chunk_with_stream_start`] so the chunker can normalize
99 /// against the actual stream start.
100 ///
101 /// # Arguments
102 ///
103 /// - `audio`: Raw PCM samples (f32, normalized to [-1.0, 1.0])
104 /// - `sample_rate`: Audio sample rate in Hz (must be > 0)
105 /// - `vad_segments`: Speech boundaries from VAD analysis
106 ///
107 /// # Returns
108 ///
109 /// Vector of `ProcessedChunk` covering the entire input duration with no
110 /// gaps.
111 ///
112 /// # Errors
113 ///
114 /// Returns `Error::InvalidInput` if:
115 /// - `sample_rate` is zero
116 /// - `audio` is empty
117 /// - VAD segments have invalid timestamps (end < start)
118 ///
119 /// # Performance
120 ///
121 /// Target: <15ms total processing time per chunk generated.
122 pub fn chunk(
123 &self,
124 audio: &[f32],
125 sample_rate: u32,
126 vad_segments: &[SpeechChunk],
127 ) -> Result<Vec<ProcessedChunk>> {
128 self.chunk_with_stream_start(audio, sample_rate, vad_segments, AudioTimestamp::EPOCH)
129 }
130
131 /// Segment audio into processing chunks with an explicit stream start time.
132 ///
133 /// Use this variant when the VAD timestamps are absolute (e.g., wall-clock)
134 /// rather than relative to the Unix epoch.
135 ///
136 /// ```
137 /// use speech_prep::{Chunker, ChunkerConfig, SpeechChunk};
138 /// use speech_prep::time::{AudioDuration, AudioTimestamp};
139 ///
140 /// # fn main() -> speech_prep::error::Result<()> {
141 /// let chunker = Chunker::new(ChunkerConfig::streaming());
142 /// let stream_start = AudioTimestamp::EPOCH;
143 ///
144 /// // VAD emits wall-clock timestamps relative to the live stream
145 /// let segments = vec![SpeechChunk {
146 /// start_time: stream_start,
147 /// end_time: stream_start.add_duration(AudioDuration::from_millis(240)),
148 /// confidence: 0.92,
149 /// avg_energy: 0.4,
150 /// frame_count: 48,
151 /// }];
152 ///
153 /// let audio = vec![0.0; 3840]; // 240ms @ 16kHz
154 /// let chunks = chunker.chunk_with_stream_start(&audio, 16_000, &segments, stream_start)?;
155 /// assert_eq!(chunks.len(), 1);
156 /// # Ok(())
157 /// # }
158 /// ```
159 pub fn chunk_with_stream_start(
160 &self,
161 audio: &[f32],
162 sample_rate: u32,
163 vad_segments: &[SpeechChunk],
164 stream_start_time: AudioTimestamp,
165 ) -> Result<Vec<ProcessedChunk>> {
166 if sample_rate == 0 {
167 return Err(Error::InvalidInput("sample_rate must be > 0".into()));
168 }
169 if audio.is_empty() {
170 return Err(Error::InvalidInput("audio buffer is empty".into()));
171 }
172
173 for segment in vad_segments {
174 if segment.end_time < segment.start_time {
175 return Err(Error::InvalidInput(
176 "VAD segment has end_time < start_time".into(),
177 ));
178 }
179 }
180
181 let processing_start = Instant::now();
182
183 let total_samples = audio.len();
184 let total_duration_secs = total_samples as f64 / f64::from(sample_rate);
185 let total_duration = Duration::from_secs_f64(total_duration_secs);
186
187 let earliest_segment_start = vad_segments.iter().map(|seg| seg.start_time).min();
188 let audio_start = earliest_segment_start.map_or(stream_start_time, |start| {
189 std::cmp::min(start, stream_start_time)
190 });
191
192 let noise_baseline =
193 Self::compute_noise_baseline(audio, sample_rate, vad_segments, audio_start);
194
195 let estimated_chunks =
196 (total_duration.as_millis() / self.config.target_duration.as_millis()).max(1) as usize
197 + 1;
198 let mut chunks = Vec::with_capacity(estimated_chunks);
199
200 if vad_segments.is_empty() {
201 chunks.push(Self::create_silence_chunk(
202 audio,
203 sample_rate,
204 audio_start,
205 total_duration,
206 audio_start,
207 )?);
208 } else {
209 let mut current_time = audio_start;
210
211 for segment in vad_segments {
212 if segment.start_time > current_time {
213 let silence_end = segment.start_time;
214 let silence_duration =
215 silence_end.duration_since(current_time).ok_or_else(|| {
216 Error::Processing("VAD segment start_time < current_time".into())
217 })?;
218
219 chunks.push(Self::create_silence_chunk(
220 audio,
221 sample_rate,
222 current_time,
223 silence_duration,
224 audio_start,
225 )?);
226 }
227
228 let segment_chunks = self.process_speech_segment(
229 audio,
230 sample_rate,
231 segment,
232 noise_baseline,
233 audio_start,
234 )?;
235 chunks.extend(segment_chunks);
236
237 current_time = segment.end_time;
238 }
239
240 let total_end_time = audio_start.add_duration(total_duration);
241 if total_end_time > current_time {
242 let trailing_duration = total_end_time
243 .duration_since(current_time)
244 .ok_or_else(|| Error::Processing("total_end_time < current_time".into()))?;
245 chunks.push(Self::create_silence_chunk(
246 audio,
247 sample_rate,
248 current_time,
249 trailing_duration,
250 audio_start,
251 )?);
252 }
253 }
254
255 let overlap_samples = Self::duration_to_samples(self.config.overlap_duration, sample_rate);
256 apply_overlaps(&mut chunks, overlap_samples, sample_rate);
257
258 let latency = processing_start.elapsed();
259 let chunk_count = chunks.len().max(1);
260 let _per_chunk = Duration::from_secs_f64(latency.as_secs_f64() / chunk_count as f64);
261 for _ in 0..chunk_count {}
262
263 Ok(chunks)
264 }
265}
266
267#[cfg(test)]
268mod tests;