// speech_prep/chunker/mod.rs
1//! Audio chunking aligned to VAD segments.
2//!
3//! This module segments standardized PCM audio into speech-aligned chunks with
4//! timing and overlap metadata.
5//!
6//! # Architecture
7//!
8//! The chunker follows a streaming-first design:
9//! 1. Accept VAD boundaries (`SpeechChunk`) + raw PCM samples
10//! 2. Generate fixed-duration chunks (default 500ms) aligned to speech
11//!    boundaries
12//! 3. Attach temporal metadata (`AudioTimestamp`) for deterministic testing
13//! 4. Attach quality metrics such as energy and speech ratio
14//!
15//! # Performance Contracts
16//!
17//! - **Latency**: <15ms total processing per chunk
18//! - **Alignment**: ±20ms accuracy to VAD boundaries
19//! - **Coverage**: Chunks cover 100% of input duration (no gaps)
20//!
21//! # Example
22//!
23//! ```rust
24//! use speech_prep::{Chunker, ChunkerConfig, SpeechChunk};
25//! use speech_prep::time::{AudioDuration, AudioTimestamp};
26//!
27//! let config = ChunkerConfig::default(); // 500ms chunks
28//! let chunker = Chunker::new(config);
29//!
30//! let audio: Vec<f32> = vec![0.0; 16000]; // 1 second @ 16kHz
31//! let vad_segments = vec![SpeechChunk {
32//!     start_time:  AudioTimestamp::EPOCH,
33//!     end_time:    AudioTimestamp::EPOCH
34//!         .add_duration(AudioDuration::from_secs(1)),
35//!     confidence:  0.9,
36//!     avg_energy:  0.5,
37//!     frame_count: 50,
38//! }];
39//!
40//! let chunks = chunker.chunk(&audio, 16000, &vad_segments)?;
41//! assert_eq!(chunks.len(), 2); // Two 500ms chunks from 1s speech
42//!
43//! // Overlaps are automatically added between chunks
44//! assert!(chunks[0].overlap_next.is_some()); // First chunk has overlap for next
45//! assert!(chunks[1].overlap_prev.is_some()); // Second chunk has overlap from prev
46//! # Ok::<(), speech_prep::error::Error>(())
47//! ```
48
49use crate::error::{Error, Result};
50use crate::time::AudioTimestamp;
51use std::time::{Duration, Instant};
52
53use crate::SpeechChunk;
54
// Internal submodules of the chunker pipeline; only the items re-exported
// below form the public surface of this module.
mod analysis;
mod config;
mod overlap;
mod planner;
mod segments;
mod types;

// Public API: configuration plus the chunk output types.
pub use config::ChunkerConfig;
// Crate-internal: overlap stitching applied as a post-pass over chunks.
use overlap::apply_overlaps;
pub use types::{ChunkBoundary, ProcessedChunk};
65
/// Audio chunker for segmenting streams into processing units.
///
/// Combines VAD boundaries with duration heuristics to produce processing-size
/// chunks. The struct is `Copy` because it holds only its configuration.
#[derive(Debug, Clone, Copy)]
pub struct Chunker {
    // Chunking parameters (target/overlap durations); immutable after construction.
    config: ChunkerConfig,
}
74
75#[allow(clippy::multiple_inherent_impl)]
76impl Chunker {
77    /// Create a new chunker with the given configuration.
78    #[must_use]
79    pub fn new(config: ChunkerConfig) -> Self {
80        Self { config }
81    }
82
83    /// Create a chunker with default configuration (500ms chunks).
84    ///
85    /// Alias for `Chunker::new(ChunkerConfig::default())`.
86    #[must_use]
87    #[allow(clippy::should_implement_trait)]
88    pub fn default() -> Self {
89        Self::new(ChunkerConfig::default())
90    }
91
92    /// Segment audio into processing chunks aligned to VAD boundaries.
93    ///
94    /// This variant assumes the VAD timestamps use the zero-based origin from
95    /// `AudioTimestamp::EPOCH`. If the timestamps already include a stream
96    /// offset, prefer [`Chunker::chunk_with_stream_start`].
97    ///
98    /// # Arguments
99    ///
100    /// - `audio`: Raw PCM samples (f32, normalized to [-1.0, 1.0])
101    /// - `sample_rate`: Audio sample rate in Hz (must be > 0)
102    /// - `vad_segments`: Speech boundaries from VAD analysis
103    ///
104    /// # Returns
105    ///
106    /// Vector of `ProcessedChunk` covering the entire input duration with no
107    /// gaps.
108    ///
109    /// # Errors
110    ///
111    /// Returns `Error::InvalidInput` if:
112    /// - `sample_rate` is zero
113    /// - `audio` is empty
114    /// - VAD segments have invalid timestamps (end < start)
115    ///
116    /// # Performance
117    ///
118    /// Target: <15ms total processing time per chunk generated.
119    pub fn chunk(
120        &self,
121        audio: &[f32],
122        sample_rate: u32,
123        vad_segments: &[SpeechChunk],
124    ) -> Result<Vec<ProcessedChunk>> {
125        self.chunk_with_stream_start(audio, sample_rate, vad_segments, AudioTimestamp::EPOCH)
126    }
127
128    /// Segment audio into processing chunks with an explicit stream start time.
129    ///
130    /// Use this variant when the VAD timestamps should be interpreted relative
131    /// to a known stream start rather than zero-based.
132    ///
133    /// ```
134    /// use speech_prep::{Chunker, ChunkerConfig, SpeechChunk};
135    /// use speech_prep::time::{AudioDuration, AudioTimestamp};
136    ///
137    /// # fn main() -> speech_prep::error::Result<()> {
138    /// let chunker = Chunker::new(ChunkerConfig::streaming());
139    /// let stream_start = AudioTimestamp::EPOCH;
140    ///
141    /// // VAD emits wall-clock timestamps relative to the live stream
142    /// let segments = vec![SpeechChunk {
143    ///     start_time:  stream_start,
144    ///     end_time:    stream_start.add_duration(AudioDuration::from_millis(240)),
145    ///     confidence:  0.92,
146    ///     avg_energy:  0.4,
147    ///     frame_count: 48,
148    /// }];
149    ///
150    /// let audio = vec![0.0; 3840]; // 240ms @ 16kHz
151    /// let chunks = chunker.chunk_with_stream_start(&audio, 16_000, &segments, stream_start)?;
152    /// assert_eq!(chunks.len(), 1);
153    /// # Ok(())
154    /// # }
155    /// ```
156    pub fn chunk_with_stream_start(
157        &self,
158        audio: &[f32],
159        sample_rate: u32,
160        vad_segments: &[SpeechChunk],
161        stream_start_time: AudioTimestamp,
162    ) -> Result<Vec<ProcessedChunk>> {
163        if sample_rate == 0 {
164            return Err(Error::InvalidInput("sample_rate must be > 0".into()));
165        }
166        if audio.is_empty() {
167            return Err(Error::InvalidInput("audio buffer is empty".into()));
168        }
169
170        for segment in vad_segments {
171            if segment.end_time < segment.start_time {
172                return Err(Error::InvalidInput(
173                    "VAD segment has end_time < start_time".into(),
174                ));
175            }
176        }
177
178        let processing_start = Instant::now();
179
180        let total_samples = audio.len();
181        let total_duration_secs = total_samples as f64 / f64::from(sample_rate);
182        let total_duration = Duration::from_secs_f64(total_duration_secs);
183
184        let earliest_segment_start = vad_segments.iter().map(|seg| seg.start_time).min();
185        let audio_start = earliest_segment_start.map_or(stream_start_time, |start| {
186            std::cmp::min(start, stream_start_time)
187        });
188
189        let noise_baseline =
190            Self::compute_noise_baseline(audio, sample_rate, vad_segments, audio_start);
191
192        let estimated_chunks =
193            (total_duration.as_millis() / self.config.target_duration.as_millis()).max(1) as usize
194                + 1;
195        let mut chunks = Vec::with_capacity(estimated_chunks);
196
197        if vad_segments.is_empty() {
198            chunks.push(Self::create_silence_chunk(
199                audio,
200                sample_rate,
201                audio_start,
202                total_duration,
203                audio_start,
204            )?);
205        } else {
206            let mut current_time = audio_start;
207
208            for segment in vad_segments {
209                if segment.start_time > current_time {
210                    let silence_end = segment.start_time;
211                    let silence_duration =
212                        silence_end.duration_since(current_time).ok_or_else(|| {
213                            Error::Processing("VAD segment start_time < current_time".into())
214                        })?;
215
216                    chunks.push(Self::create_silence_chunk(
217                        audio,
218                        sample_rate,
219                        current_time,
220                        silence_duration,
221                        audio_start,
222                    )?);
223                }
224
225                let segment_chunks = self.process_speech_segment(
226                    audio,
227                    sample_rate,
228                    segment,
229                    noise_baseline,
230                    audio_start,
231                )?;
232                chunks.extend(segment_chunks);
233
234                current_time = segment.end_time;
235            }
236
237            let total_end_time = audio_start.add_duration(total_duration);
238            if total_end_time > current_time {
239                let trailing_duration = total_end_time
240                    .duration_since(current_time)
241                    .ok_or_else(|| Error::Processing("total_end_time < current_time".into()))?;
242                chunks.push(Self::create_silence_chunk(
243                    audio,
244                    sample_rate,
245                    current_time,
246                    trailing_duration,
247                    audio_start,
248                )?);
249            }
250        }
251
252        let overlap_samples = Self::duration_to_samples(self.config.overlap_duration, sample_rate);
253        apply_overlaps(&mut chunks, overlap_samples, sample_rate);
254
255        let latency = processing_start.elapsed();
256        let chunk_count = chunks.len().max(1);
257        let _per_chunk = Duration::from_secs_f64(latency.as_secs_f64() / chunk_count as f64);
258        for _ in 0..chunk_count {}
259
260        Ok(chunks)
261    }
262}
263
264#[cfg(test)]
265mod tests;