speech_prep/chunker/types.rs
1use crate::error::{Error, Result};
2use crate::time::{AudioDuration, AudioTimestamp};
3
/// Classifies the speech context found at an edge of a chunk.
///
/// Downstream consumers can inspect the boundary kind at a chunk's start or
/// end and pick suitable processing for it (e.g., cross-fading at
/// transitions).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ChunkBoundary {
    /// The chunk begins where a speech segment begins.
    SpeechStart,

    /// The chunk finishes where a speech segment finishes.
    SpeechEnd,

    /// The chunk edge lies mid-speech, inside a longer segment.
    Continuation,

    /// The chunk holds only silence (the VAD detected no speech).
    Silence,
}
22
23/// A processed audio chunk with temporal and quality metadata.
24///
25/// Represents a segment of audio aligned to speech boundaries, suitable for
26/// parallel processing by multiple downstream consumers.
27#[derive(Debug, Clone)]
28pub struct ProcessedChunk {
29 /// Audio samples in the chunk (normalized f32, range [-1.0, 1.0]).
30 pub samples: Vec<f32>,
31
32 /// Type of boundary at chunk start.
33 pub start_boundary: ChunkBoundary,
34
35 /// Type of boundary at chunk end.
36 pub end_boundary: ChunkBoundary,
37
38 /// Absolute start time of chunk in audio stream.
39 pub start_time: AudioTimestamp,
40
41 /// Absolute end time of chunk in audio stream.
42 pub end_time: AudioTimestamp,
43
44 /// Ratio of speech frames in chunk (0.0 = all silence, 1.0 = all speech).
45 ///
46 /// Derived from VAD analysis. Used for adaptive routing decisions
47 /// (high speech ratio → GPU tier, low → CPU tier).
48 pub speech_ratio: f32,
49
50 /// RMS energy of the chunk (computed during generation).
51 ///
52 /// Useful for quality assessment and adaptive processing decisions.
53 pub energy: f32,
54
55 /// Signal-to-noise ratio in decibels (dB).
56 ///
57 /// Computed as `20 * log10(signal_rms / noise_rms)`, where `noise_rms` is
58 /// estimated from silence regions. `None` if no noise baseline is available
59 /// (e.g., first chunk with no silence).
60 ///
61 /// Higher SNR values indicate cleaner audio:
62 /// - >30 dB: Excellent quality
63 /// - 20-30 dB: Good quality
64 /// - 10-20 dB: Acceptable quality
65 /// - <10 dB: Poor quality (high noise)
66 pub snr_db: Option<f32>,
67
68 /// Indicates whether the chunk contains clipping artifacts.
69 ///
70 /// Clipping occurs when sample values exceed the normalized range
71 /// [-1.0, 1.0], typically manifesting as |sample| >= 0.999.
72 /// Clipped audio may cause distortion in downstream processing.
73 ///
74 /// `true` if any sample in the chunk is clipped, `false` otherwise.
75 pub has_clipping: bool,
76
77 /// Overlap samples from the previous chunk (for context).
78 ///
79 /// Contains the trailing `overlap_duration` samples from the previous
80 /// chunk, providing acoustic context for downstream speech processing.
81 /// `None` for the first chunk in the stream.
82 pub overlap_prev: Option<Vec<f32>>,
83
84 /// Overlap samples for the next chunk (for context).
85 ///
86 /// Contains the trailing `overlap_duration` samples from this chunk, to be
87 /// prepended to the next chunk for context. `None` for the last chunk in
88 /// the stream.
89 pub overlap_next: Option<Vec<f32>>,
90
91 /// Actual overlap duration in milliseconds.
92 ///
93 /// The duration of samples in `overlap_prev` and `overlap_next`. Typically
94 /// matches `ChunkerConfig::overlap_duration` (default 50ms), but may be
95 /// shorter for chunks at stream boundaries.
96 pub overlap_ms: u32,
97}
98
99impl ProcessedChunk {
100 /// Get the duration of this chunk.
101 ///
102 /// # Errors
103 ///
104 /// Returns `Error::Processing` if `end_time` < `start_time` (indicates
105 /// invalid chunk).
106 pub fn duration(&self) -> Result<AudioDuration> {
107 self.end_time
108 .duration_since(self.start_time)
109 .ok_or_else(|| {
110 Error::Processing("invalid chunk times: end_time precedes start_time".into())
111 })
112 }
113
114 /// Check if this chunk contains primarily speech.
115 #[must_use]
116 pub fn is_speech(&self) -> bool {
117 self.speech_ratio > 0.5
118 }
119
120 /// Check if this chunk is silence.
121 #[must_use]
122 pub fn is_silence(&self) -> bool {
123 self.start_boundary == ChunkBoundary::Silence && self.end_boundary == ChunkBoundary::Silence
124 }
125
126 /// Get samples without overlap (deduplicated core content).
127 ///
128 /// Returns the chunk's primary samples, excluding any overlap regions that
129 /// would be duplicated when processing sequential chunks. Useful for
130 /// downstream consumers that want to avoid processing overlap regions
131 /// twice.
132 pub fn samples_without_overlap(&self) -> &[f32] {
133 &self.samples
134 }
135
136 /// Returns total sample count including overlap regions.
137 ///
138 /// Useful for buffer allocation when reconstructing the full audio data
139 /// with prepended/appended overlaps.
140 #[must_use]
141 pub fn total_samples_with_overlap(&self) -> usize {
142 let prev_overlap = self.overlap_prev.as_ref().map_or(0, Vec::len);
143 let next_overlap = self.overlap_next.as_ref().map_or(0, Vec::len);
144
145 self.samples.len() + prev_overlap + next_overlap
146 }
147}