Skip to main content

speech_prep/chunker/
types.rs

1use crate::error::{Error, Result};
2use crate::time::{AudioDuration, AudioTimestamp};
3
/// Type of boundary at chunk edges.
///
/// Indicates the speech context at chunk start/end, enabling downstream
/// consumers to apply appropriate processing (e.g., cross-fade at transitions).
///
/// This is a plain field-less enum: it is `Copy`, comparable with `==`, and
/// hashable, so it can be used directly as a `HashMap`/`HashSet` key (for
/// example, to tally chunks per boundary kind).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ChunkBoundary {
    /// Chunk starts at beginning of speech segment.
    SpeechStart,

    /// Chunk ends at end of speech segment.
    SpeechEnd,

    /// Chunk is mid-speech (continuation of longer segment).
    Continuation,

    /// Chunk contains only silence (no speech detected by VAD).
    Silence,
}
22
23/// A processed audio chunk with temporal and quality metadata.
24///
25/// Represents a segment of audio aligned to speech boundaries, suitable for
26/// parallel processing by multiple downstream consumers.
27#[derive(Debug, Clone)]
28pub struct ProcessedChunk {
29    /// Audio samples in the chunk (normalized f32, range [-1.0, 1.0]).
30    pub samples: Vec<f32>,
31
32    /// Type of boundary at chunk start.
33    pub start_boundary: ChunkBoundary,
34
35    /// Type of boundary at chunk end.
36    pub end_boundary: ChunkBoundary,
37
38    /// Absolute start time of chunk in audio stream.
39    pub start_time: AudioTimestamp,
40
41    /// Absolute end time of chunk in audio stream.
42    pub end_time: AudioTimestamp,
43
44    /// Ratio of speech frames in chunk (0.0 = all silence, 1.0 = all speech).
45    ///
46    /// Derived from VAD analysis. Used for adaptive routing decisions
47    /// (high speech ratio → GPU tier, low → CPU tier).
48    pub speech_ratio: f32,
49
50    /// RMS energy of the chunk (computed during generation).
51    ///
52    /// Useful for quality assessment and adaptive processing decisions.
53    pub energy: f32,
54
55    /// Signal-to-noise ratio in decibels (dB).
56    ///
57    /// Computed as `20 * log10(signal_rms / noise_rms)`, where `noise_rms` is
58    /// estimated from silence regions. `None` if no noise baseline is available
59    /// (e.g., first chunk with no silence).
60    ///
61    /// Higher SNR values indicate cleaner audio:
62    /// - >30 dB: Excellent quality
63    /// - 20-30 dB: Good quality
64    /// - 10-20 dB: Acceptable quality
65    /// - <10 dB: Poor quality (high noise)
66    pub snr_db: Option<f32>,
67
68    /// Indicates whether the chunk contains clipping artifacts.
69    ///
70    /// Clipping occurs when sample values exceed the normalized range
71    /// [-1.0, 1.0], typically manifesting as |sample| >= 0.999.
72    /// Clipped audio may cause distortion in downstream processing.
73    ///
74    /// `true` if any sample in the chunk is clipped, `false` otherwise.
75    pub has_clipping: bool,
76
77    /// Overlap samples from the previous chunk (for context).
78    ///
79    /// Contains the trailing `overlap_duration` samples from the previous
80    /// chunk, providing acoustic context for downstream speech processing.
81    /// `None` for the first chunk in the stream.
82    pub overlap_prev: Option<Vec<f32>>,
83
84    /// Overlap samples for the next chunk (for context).
85    ///
86    /// Contains the trailing `overlap_duration` samples from this chunk, to be
87    /// prepended to the next chunk for context. `None` for the last chunk in
88    /// the stream.
89    pub overlap_next: Option<Vec<f32>>,
90
91    /// Actual overlap duration in milliseconds.
92    ///
93    /// The duration of samples in `overlap_prev` and `overlap_next`. Typically
94    /// matches `ChunkerConfig::overlap_duration` (default 50ms), but may be
95    /// shorter for chunks at stream boundaries.
96    pub overlap_ms: u32,
97}
98
99impl ProcessedChunk {
100    /// Get the duration of this chunk.
101    ///
102    /// # Errors
103    ///
104    /// Returns `Error::Processing` if `end_time` < `start_time` (indicates
105    /// invalid chunk).
106    pub fn duration(&self) -> Result<AudioDuration> {
107        self.end_time
108            .duration_since(self.start_time)
109            .ok_or_else(|| {
110                Error::Processing("invalid chunk times: end_time precedes start_time".into())
111            })
112    }
113
114    /// Check if this chunk contains primarily speech.
115    #[must_use]
116    pub fn is_speech(&self) -> bool {
117        self.speech_ratio > 0.5
118    }
119
120    /// Check if this chunk is silence.
121    #[must_use]
122    pub fn is_silence(&self) -> bool {
123        self.start_boundary == ChunkBoundary::Silence && self.end_boundary == ChunkBoundary::Silence
124    }
125
126    /// Get samples without overlap (deduplicated core content).
127    ///
128    /// Returns the chunk's primary samples, excluding any overlap regions that
129    /// would be duplicated when processing sequential chunks. Useful for
130    /// downstream consumers that want to avoid processing overlap regions
131    /// twice.
132    pub fn samples_without_overlap(&self) -> &[f32] {
133        &self.samples
134    }
135
136    /// Returns total sample count including overlap regions.
137    ///
138    /// Useful for buffer allocation when reconstructing the full audio data
139    /// with prepended/appended overlaps.
140    #[must_use]
141    pub fn total_samples_with_overlap(&self) -> usize {
142        let prev_overlap = self.overlap_prev.as_ref().map_or(0, Vec::len);
143        let next_overlap = self.overlap_next.as_ref().map_or(0, Vec::len);
144
145        self.samples.len() + prev_overlap + next_overlap
146    }
147}