// speech_prep/chunker/mod.rs
1//! Audio chunking pipeline for multi-consumer processing.
2//!
3//! This module segments standardized PCM audio into speech-aligned chunks with
4//! rich metadata, enabling parallel downstream processing by speech
5//! recognition, scoring, and alignment pipelines.
6//!
7//! # Architecture
8//!
9//! The chunker follows a streaming-first design:
10//! 1. Accept VAD boundaries (`SpeechChunk`) + raw PCM samples
11//! 2. Generate fixed-duration chunks (default 500ms) aligned to speech
12//!    boundaries
13//! 3. Attach temporal metadata (`AudioTimestamp`) for deterministic testing
14//! 4. Provide quality metrics (energy, speech ratio) for adaptive routing
15//!
16//! # Performance Contracts
17//!
18//! - **Latency**: <15ms total processing per chunk
19//! - **Alignment**: ±20ms accuracy to VAD boundaries
20//! - **Coverage**: Chunks cover 100% of input duration (no gaps)
21//!
22//! # Example
23//!
24//! ```rust
25//! use speech_prep::{Chunker, ChunkerConfig, SpeechChunk};
26//! use speech_prep::time::{AudioDuration, AudioTimestamp};
27//!
28//! let config = ChunkerConfig::default(); // 500ms chunks
29//! let chunker = Chunker::new(config);
30//!
31//! let audio: Vec<f32> = vec![0.0; 16000]; // 1 second @ 16kHz
32//! let vad_segments = vec![SpeechChunk {
33//!     start_time:  AudioTimestamp::EPOCH,
34//!     end_time:    AudioTimestamp::EPOCH
35//!         .add_duration(AudioDuration::from_secs(1)),
36//!     confidence:  0.9,
37//!     avg_energy:  0.5,
38//!     frame_count: 50,
39//! }];
40//!
41//! let chunks = chunker.chunk(&audio, 16000, &vad_segments)?;
42//! assert_eq!(chunks.len(), 2); // Two 500ms chunks from 1s speech
43//!
44//! // Overlaps are automatically added between chunks
45//! assert!(chunks[0].overlap_next.is_some()); // First chunk has overlap for next
46//! assert!(chunks[1].overlap_prev.is_some()); // Second chunk has overlap from prev
47//! # Ok::<(), speech_prep::error::Error>(())
48//! ```
49
50use crate::error::{Error, Result};
51use crate::time::AudioTimestamp;
52use std::time::{Duration, Instant};
53
54use crate::SpeechChunk;
55
56mod analysis;
57mod config;
58mod overlap;
59mod planner;
60mod segments;
61mod types;
62
63pub use config::ChunkerConfig;
64use overlap::apply_overlaps;
65pub use types::{ChunkBoundary, ProcessedChunk};
66
/// Audio chunker for segmenting streams into processing units.
///
/// Combines VAD boundaries with duration heuristics to produce chunks optimized
/// for parallel consumption by downstream pipelines (recognition, scoring,
/// alignment).
// `Copy` is derivable only because the sole field is `ChunkerConfig`, which
// must itself be `Copy`; the chunker holds no buffers or run-time state.
#[derive(Debug, Clone, Copy)]
pub struct Chunker {
    // Chunking parameters (target chunk duration, overlap duration, ...);
    // see `ChunkerConfig` for the individual knobs and their defaults.
    config: ChunkerConfig,
}
75
76#[allow(clippy::multiple_inherent_impl)]
77impl Chunker {
78    /// Create a new chunker with the given configuration.
79    #[must_use]
80    pub fn new(config: ChunkerConfig) -> Self {
81        Self { config }
82    }
83
84    /// Create a chunker with default configuration (500ms chunks).
85    ///
86    /// Alias for `Chunker::new(ChunkerConfig::default())`.
87    #[must_use]
88    #[allow(clippy::should_implement_trait)]
89    pub fn default() -> Self {
90        Self::new(ChunkerConfig::default())
91    }
92
93    /// Segment audio into processing chunks aligned to VAD boundaries.
94    ///
95    /// This variant assumes that VAD timestamps are relative to the Unix epoch
96    /// (e.g., tests that build times off `AudioTimestamp::EPOCH`). For streaming
97    /// scenarios where VAD emits wall-clock timestamps (`AudioTimestamp::now()`),
98    /// prefer [`Chunker::chunk_with_stream_start`] so the chunker can normalize
99    /// against the actual stream start.
100    ///
101    /// # Arguments
102    ///
103    /// - `audio`: Raw PCM samples (f32, normalized to [-1.0, 1.0])
104    /// - `sample_rate`: Audio sample rate in Hz (must be > 0)
105    /// - `vad_segments`: Speech boundaries from VAD analysis
106    ///
107    /// # Returns
108    ///
109    /// Vector of `ProcessedChunk` covering the entire input duration with no
110    /// gaps.
111    ///
112    /// # Errors
113    ///
114    /// Returns `Error::InvalidInput` if:
115    /// - `sample_rate` is zero
116    /// - `audio` is empty
117    /// - VAD segments have invalid timestamps (end < start)
118    ///
119    /// # Performance
120    ///
121    /// Target: <15ms total processing time per chunk generated.
122    pub fn chunk(
123        &self,
124        audio: &[f32],
125        sample_rate: u32,
126        vad_segments: &[SpeechChunk],
127    ) -> Result<Vec<ProcessedChunk>> {
128        self.chunk_with_stream_start(audio, sample_rate, vad_segments, AudioTimestamp::EPOCH)
129    }
130
131    /// Segment audio into processing chunks with an explicit stream start time.
132    ///
133    /// Use this variant when the VAD timestamps are absolute (e.g., wall-clock)
134    /// rather than relative to the Unix epoch.
135    ///
136    /// ```
137    /// use speech_prep::{Chunker, ChunkerConfig, SpeechChunk};
138    /// use speech_prep::time::{AudioDuration, AudioTimestamp};
139    ///
140    /// # fn main() -> speech_prep::error::Result<()> {
141    /// let chunker = Chunker::new(ChunkerConfig::streaming());
142    /// let stream_start = AudioTimestamp::EPOCH;
143    ///
144    /// // VAD emits wall-clock timestamps relative to the live stream
145    /// let segments = vec![SpeechChunk {
146    ///     start_time:  stream_start,
147    ///     end_time:    stream_start.add_duration(AudioDuration::from_millis(240)),
148    ///     confidence:  0.92,
149    ///     avg_energy:  0.4,
150    ///     frame_count: 48,
151    /// }];
152    ///
153    /// let audio = vec![0.0; 3840]; // 240ms @ 16kHz
154    /// let chunks = chunker.chunk_with_stream_start(&audio, 16_000, &segments, stream_start)?;
155    /// assert_eq!(chunks.len(), 1);
156    /// # Ok(())
157    /// # }
158    /// ```
159    pub fn chunk_with_stream_start(
160        &self,
161        audio: &[f32],
162        sample_rate: u32,
163        vad_segments: &[SpeechChunk],
164        stream_start_time: AudioTimestamp,
165    ) -> Result<Vec<ProcessedChunk>> {
166        if sample_rate == 0 {
167            return Err(Error::InvalidInput("sample_rate must be > 0".into()));
168        }
169        if audio.is_empty() {
170            return Err(Error::InvalidInput("audio buffer is empty".into()));
171        }
172
173        for segment in vad_segments {
174            if segment.end_time < segment.start_time {
175                return Err(Error::InvalidInput(
176                    "VAD segment has end_time < start_time".into(),
177                ));
178            }
179        }
180
181        let processing_start = Instant::now();
182
183        let total_samples = audio.len();
184        let total_duration_secs = total_samples as f64 / f64::from(sample_rate);
185        let total_duration = Duration::from_secs_f64(total_duration_secs);
186
187        let earliest_segment_start = vad_segments.iter().map(|seg| seg.start_time).min();
188        let audio_start = earliest_segment_start.map_or(stream_start_time, |start| {
189            std::cmp::min(start, stream_start_time)
190        });
191
192        let noise_baseline =
193            Self::compute_noise_baseline(audio, sample_rate, vad_segments, audio_start);
194
195        let estimated_chunks =
196            (total_duration.as_millis() / self.config.target_duration.as_millis()).max(1) as usize
197                + 1;
198        let mut chunks = Vec::with_capacity(estimated_chunks);
199
200        if vad_segments.is_empty() {
201            chunks.push(Self::create_silence_chunk(
202                audio,
203                sample_rate,
204                audio_start,
205                total_duration,
206                audio_start,
207            )?);
208        } else {
209            let mut current_time = audio_start;
210
211            for segment in vad_segments {
212                if segment.start_time > current_time {
213                    let silence_end = segment.start_time;
214                    let silence_duration =
215                        silence_end.duration_since(current_time).ok_or_else(|| {
216                            Error::Processing("VAD segment start_time < current_time".into())
217                        })?;
218
219                    chunks.push(Self::create_silence_chunk(
220                        audio,
221                        sample_rate,
222                        current_time,
223                        silence_duration,
224                        audio_start,
225                    )?);
226                }
227
228                let segment_chunks = self.process_speech_segment(
229                    audio,
230                    sample_rate,
231                    segment,
232                    noise_baseline,
233                    audio_start,
234                )?;
235                chunks.extend(segment_chunks);
236
237                current_time = segment.end_time;
238            }
239
240            let total_end_time = audio_start.add_duration(total_duration);
241            if total_end_time > current_time {
242                let trailing_duration = total_end_time
243                    .duration_since(current_time)
244                    .ok_or_else(|| Error::Processing("total_end_time < current_time".into()))?;
245                chunks.push(Self::create_silence_chunk(
246                    audio,
247                    sample_rate,
248                    current_time,
249                    trailing_duration,
250                    audio_start,
251                )?);
252            }
253        }
254
255        let overlap_samples = Self::duration_to_samples(self.config.overlap_duration, sample_rate);
256        apply_overlaps(&mut chunks, overlap_samples, sample_rate);
257
258        let latency = processing_start.elapsed();
259        let chunk_count = chunks.len().max(1);
260        let _per_chunk = Duration::from_secs_f64(latency.as_secs_f64() / chunk_count as f64);
261        for _ in 0..chunk_count {}
262
263        Ok(chunks)
264    }
265}
266
267#[cfg(test)]
268mod tests;