Skip to main content

svod_model/audio/
splitter.rs

1//! Chunking abstraction for long-form audio inference.
2//!
3//! `Splitter` is the pluggable boundary between a raw waveform and the
4//! encoder-bounded chunks an ASR inference loop consumes. The trait + the
5//! built-in [`FixedLengthSplitter`] live here (audio-level preprocessing,
6//! same module family as the mel front-end); VAD-driven implementations
7//! ship next to their VAD model — see
8//! [`SileroVadSplitter`](crate::silero_vad::SileroVadSplitter).
9//!
10//! [`Transcriber`](crate::gigaam::Transcriber) is generic over the splitter;
11//! `S::Error` flows through `TranscribeError<E>` the same way
12//! `RnntDecodeError<JitError>` carries the step-backend error.
13//!
14//! Users with pre-segmented audio (pyannote, manual cuts) can skip the
15//! splitter entirely via
16//! [`Transcriber::transcribe_chunks`](crate::gigaam::Transcriber::transcribe_chunks).
17
18pub use svod_arch::vad::AudioChunk;
19
/// Encoder-derived limits handed to [`Splitter::split`].
///
/// Stores the raw model-config primitives rather than pre-computed
/// durations, so each splitter can work in whichever unit suits it. The
/// helpers [`max_samples`](Self::max_samples),
/// [`align_to_samples`](Self::align_to_samples), and
/// [`encoder_capacity_secs`](Self::encoder_capacity_secs) cover the common
/// derivations.
///
/// [`max_samples`](Self::max_samples) reserves `2 * subsampling_factor` mel
/// frames of headroom, mirroring the JIT prepare loop's
/// `subs_output_length` margin — a chunk filling `max_samples()` is meant to
/// fit through the subsampling stack without padding overflow.
///
/// All integer derivations saturate instead of overflowing, so a
/// misconfigured (absurdly large) bounds value degrades to `usize::MAX` / `0`
/// rather than panicking in debug builds or wrapping in release builds.
#[derive(Clone, Copy, Debug)]
pub struct EncoderBounds {
    /// Samples per second of the waveform; divisor used by
    /// [`encoder_capacity_secs`](Self::encoder_capacity_secs).
    pub sample_rate: u32,
    /// Waveform samples per mel frame.
    pub hop_length: usize,
    /// Encoder subsampling factor, counted in mel frames.
    pub subsampling_factor: usize,
    /// Largest number of mel frames the encoder accepts, headroom included.
    pub max_mel_frames: usize,
}

impl EncoderBounds {
    /// Sample-domain stride alignment: `hop_length * subsampling_factor`
    /// (saturating). Splitters that produce frame-aligned chunks should snap
    /// boundaries to a multiple of this value.
    ///
    /// May return `0` when either factor is zero; callers that divide by it
    /// must guard against that.
    pub fn align_to_samples(&self) -> usize {
        self.hop_length.saturating_mul(self.subsampling_factor)
    }

    /// Maximum chunk length (in samples) the encoder can ingest:
    /// `(max_mel_frames - 2 * subsampling_factor) * hop_length`.
    ///
    /// Every step saturates: the result is `0` when the headroom is at least
    /// `max_mel_frames`, and `usize::MAX` when the product would overflow.
    pub fn max_samples(&self) -> usize {
        // Headroom in mel frames; matches the JIT prepare path's margin.
        let headroom_frames = self.subsampling_factor.saturating_mul(2);
        self.max_mel_frames
            .saturating_sub(headroom_frames)
            .saturating_mul(self.hop_length)
    }

    /// Convenience for splitters that reason in wall-clock seconds:
    /// `max_samples() / sample_rate`.
    ///
    /// The conversion to `f32` is lossy for very large sample counts, and a
    /// `sample_rate` of zero yields a non-finite value (infinity or NaN)
    /// rather than a panic.
    pub fn encoder_capacity_secs(&self) -> f32 {
        self.max_samples() as f32 / self.sample_rate as f32
    }
}
61
62/// Chunking strategy: turn a waveform into encoder-bounded `AudioChunk`s.
63///
64/// `split` is called once per
65/// [`Transcriber::transcribe`](crate::gigaam::Transcriber::transcribe) call.
66/// Implementations may keep mutable state across calls (the Silero VAD JIT
67/// carries LSTM state internally), hence `&mut self`. Chunks must satisfy
68/// `end_sample <= waveform.len()` and `end_sample - start_sample <=
69/// bounds.max_samples()`; alignment is a soft preference (floor-division on
70/// `hop_length` tolerates misaligned boundaries).
71///
72/// Splitters that align chunk ends to encoder strides
73/// ([`align_to_samples`](EncoderBounds::align_to_samples)) can round the
74/// trailing chunk past `waveform.len()`. The contract still requires
75/// in-range chunks — use [`trim_chunks_to_waveform`] to clean up the tail
76/// before returning.
77pub trait Splitter {
78    type Error: std::error::Error + Send + Sync + 'static;
79
80    fn split(&mut self, waveform: &[f32], bounds: &EncoderBounds) -> Result<Vec<AudioChunk>, Self::Error>;
81
82    /// Upper bound (in samples) on the longest chunk this splitter could
83    /// emit. Consumed by the transcriber to size JIT buffers — a tighter
84    /// advertised bound trades peak memory for tighter chunk handling.
85    /// Default: full encoder capacity.
86    fn max_chunk_samples(&self, bounds: &EncoderBounds) -> usize {
87        bounds.max_samples()
88    }
89}
90
91/// Trim `chunks` so every entry stays within `0..waveform_len`: drop chunks
92/// starting at or past `waveform_len`, then clamp the trailing chunk's
93/// `end_sample` down to `waveform_len`. Intended for [`Splitter`]
94/// implementations whose stride alignment can push the last chunk past the
95/// waveform end (see the trait docs).
96///
97/// Assumes `chunks` is in increasing `start_sample` order — the
98/// `Splitter::split` contract — so a single pass from the back is enough.
99pub fn trim_chunks_to_waveform(chunks: &mut Vec<AudioChunk>, waveform_len: usize) {
100    while let Some(mut last) = chunks.pop() {
101        if last.start_sample >= waveform_len {
102            continue;
103        }
104        last.end_sample = last.end_sample.min(waveform_len);
105        chunks.push(last);
106        break;
107    }
108}
109
/// Splitter without VAD: steps through the waveform in strides of at most
/// `bounds.max_samples()`, snapping every non-final chunk to a multiple of
/// `bounds.align_to_samples()`.
///
/// No model is loaded. A good fit when the caller has already segmented the
/// input, for short utterances that fit in one chunk, or for tests. Cutting
/// at fixed offsets loses boundary context and degrades transcription
/// quality at chunk seams — for production long-form ASR prefer
/// [`SileroVadSplitter`](crate::silero_vad::SileroVadSplitter).
///
/// `max_samples` is not guaranteed to be a multiple of `align_to_samples`.
/// Non-final chunks are therefore rounded down to the largest whole number
/// of alignment units that fits (so their mel length stays an integer
/// multiple of `subsampling_factor`). The very last chunk keeps its
/// unaligned tail — the JIT pads it.
#[derive(Clone, Debug, Default)]
pub struct FixedLengthSplitter {
    // Kept as a braced struct rather than a unit struct so that v2 can add
    // `overlap_samples` without a breaking API change.
}

impl FixedLengthSplitter {
    /// Create a splitter; equivalent to [`Default::default`].
    pub fn new() -> Self {
        Self::default()
    }
}
135
136impl Splitter for FixedLengthSplitter {
137    type Error = std::convert::Infallible;
138
139    fn split(&mut self, waveform: &[f32], bounds: &EncoderBounds) -> Result<Vec<AudioChunk>, Self::Error> {
140        if waveform.is_empty() {
141            return Ok(Vec::new());
142        }
143        let align = bounds.align_to_samples().max(1);
144        // Saturating max_samples so a misconfigured bounds (zero, overflow)
145        // never stalls the loop. Floor at `align` so the loop always advances.
146        let max = bounds.max_samples().max(align);
147
148        let mut chunks = Vec::new();
149        let mut start = 0usize;
150        while start < waveform.len() {
151            let nominal_end = start.saturating_add(max).min(waveform.len());
152            let aligned_end = if nominal_end == waveform.len() {
153                nominal_end
154            } else {
155                let span = nominal_end - start;
156                let aligned_span = (span / align) * align;
157                start + aligned_span.max(align)
158            };
159            chunks.push(AudioChunk { start_sample: start, end_sample: aligned_end });
160            start = aligned_end;
161        }
162        Ok(chunks)
163    }
164}