use crate::vad::Vad;
use crate::{SpeechModel, TranscribeError, TranscribeOptions, TranscriptionResult};
use super::merge::merge_sequential_with_separator;
use super::{rms_energy, transcribe_padded, Transcriber, SAMPLE_RATE};
/// Tuning knobs for [`VadChunked`].
pub struct VadChunkedConfig {
    /// Minimum buffered speech, in seconds, before a speech-to-silence
    /// transition triggers transcription; shorter buffers are carried forward.
    /// The value is also forwarded to `transcribe_padded`.
    pub min_chunk_secs: f32,

    /// Buffered speech duration, in seconds, at which the buffer is
    /// force-split even though speech is still ongoing.
    pub max_chunk_secs: f32,

    /// Padding, in seconds, forwarded to `transcribe_padded` for every chunk.
    pub padding_secs: f32,

    /// When `Some`, a forced split searches this many trailing seconds of the
    /// buffer for the lowest-RMS frame and cuts there; when `None`, the whole
    /// buffer is flushed as-is.
    pub smart_split_search_secs: Option<f32>,

    /// Separator handed to `merge_sequential_with_separator` when the
    /// per-chunk results are merged at the end of a session.
    pub merge_separator: String,
}
impl Default for VadChunkedConfig {
    /// Defaults: chunks between 1 s and 30 s, no padding, smart splitting
    /// disabled, and a single space as the merge separator.
    fn default() -> Self {
        let merge_separator = String::from(" ");
        Self {
            merge_separator,
            smart_split_search_secs: None,
            padding_secs: 0.0,
            max_chunk_secs: 30.0,
            min_chunk_secs: 1.0,
        }
    }
}
/// Streaming transcriber that uses a voice-activity detector to cut incoming
/// audio into speech chunks and transcribes each chunk separately.
pub struct VadChunked {
    /// Frame classifier; also supplies the frame size and onset prefill audio.
    vad: Box<dyn Vad>,
    /// Chunking and merging parameters.
    config: VadChunkedConfig,
    /// Options forwarded to every transcription call.
    options: TranscribeOptions,

    /// Speech samples accumulated since the last flush.
    speech_buffer: Vec<f32>,
    /// Trailing samples of the last `feed` call that did not fill a full frame.
    pending: Vec<f32>,
    /// Whether the most recently classified frame was speech.
    in_speech: bool,
    /// Number of samples classified so far in this session.
    elapsed_samples: usize,
    /// Session sample index at which the buffered audio starts, when known.
    speech_start_sample: Option<usize>,
    /// Number of chunks handed to the model so far (used for logging).
    chunk_index: usize,
    /// Successful per-chunk results, merged when the session finishes.
    results: Vec<TranscriptionResult>,
}
impl VadChunked {
/// Creates a transcriber with empty buffers and no recorded results.
///
/// `vad` classifies frames, `config` controls chunking, and `options` is
/// forwarded unchanged to every transcription call.
pub fn new(vad: Box<dyn Vad>, config: VadChunkedConfig, options: TranscribeOptions) -> Self {
    Self {
        // Caller-supplied collaborators.
        vad,
        config,
        options,
        // Counters and flags all start from a clean session.
        elapsed_samples: 0,
        chunk_index: 0,
        in_speech: false,
        speech_start_sample: None,
        // Buffers start empty and grow on demand.
        pending: Vec::new(),
        speech_buffer: Vec::new(),
        results: Vec::new(),
    }
}
/// Splits the speech buffer at the quietest frame inside the trailing
/// `search_secs` window and transcribes everything before the split.
///
/// The split point is the end of the frame with the lowest RMS energy (the
/// earliest one on ties). Samples after the split stay buffered and the
/// recorded start position is advanced accordingly. If no full frame fits in
/// the window, the whole buffer is transcribed.
fn smart_split_buffer(
    &mut self,
    model: &mut dyn SpeechModel,
    search_secs: f32,
) -> Result<TranscriptionResult, TranscribeError> {
    let frame_len = self.vad.frame_size();
    let total = self.speech_buffer.len();

    // Start of the search window, rounded down to a frame boundary.
    let window = (search_secs * SAMPLE_RATE) as usize;
    let window_start = total.saturating_sub(window) / frame_len * frame_len;

    let mut quietest = f32::MAX;
    let mut split_at = total;
    for (idx, frame) in self.speech_buffer[window_start..]
        .chunks_exact(frame_len)
        .enumerate()
    {
        let energy = rms_energy(frame);
        // Strict comparison keeps the earliest frame when energies tie.
        if energy < quietest {
            quietest = energy;
            split_at = window_start + (idx + 1) * frame_len;
        }
    }

    log::info!(
        "smart split: search window {:.2}s, best split at {:.2}s (rms={:.4}), buffer={:.2}s",
        search_secs,
        split_at as f32 / SAMPLE_RATE,
        quietest,
        total as f32 / SAMPLE_RATE,
    );

    let head: Vec<f32> = self.speech_buffer.drain(..split_at).collect();

    // Without a recorded start, assume the buffer ends at the current position.
    let start_sample = match self.speech_start_sample {
        Some(sample) => sample,
        None => self
            .elapsed_samples
            .saturating_sub(self.speech_buffer.len() + head.len()),
    };
    let chunk_start_secs = start_sample as f32 / SAMPLE_RATE;

    // The remainder (if any) now begins `split_at` samples later.
    self.speech_start_sample = if self.speech_buffer.is_empty() {
        None
    } else {
        self.speech_start_sample.map(|sample| sample + split_at)
    };

    self.transcribe_chunk(model, head, chunk_start_secs)
}
/// Transcribes the entire speech buffer, leaving it empty and clearing the
/// recorded start position.
///
/// When no start position is recorded, the chunk is assumed to end at the
/// current session position.
fn flush_speech_buffer(
    &mut self,
    model: &mut dyn SpeechModel,
) -> Result<TranscriptionResult, TranscribeError> {
    let chunk = std::mem::take(&mut self.speech_buffer);
    let start_sample = match self.speech_start_sample.take() {
        Some(sample) => sample,
        None => self.elapsed_samples.saturating_sub(chunk.len()),
    };
    let chunk_start_secs = start_sample as f32 / SAMPLE_RATE;
    self.transcribe_chunk(model, chunk, chunk_start_secs)
}
/// Runs one chunk through `transcribe_padded`, records the result for the
/// final merge, and returns a copy of it.
///
/// The chunk counter is advanced before transcription, so it also counts
/// chunks whose transcription fails.
fn transcribe_chunk(
    &mut self,
    model: &mut dyn SpeechModel,
    samples: Vec<f32>,
    chunk_start_secs: f32,
) -> Result<TranscriptionResult, TranscribeError> {
    let index = self.chunk_index;
    let sample_count = samples.len();
    let padding_ms = self.config.padding_secs * 1000.0;
    log::info!(
        "chunk {}: start={:.2}s duration={:.2}s samples={} padding={:.0}ms",
        index,
        chunk_start_secs,
        sample_count as f32 / SAMPLE_RATE,
        sample_count,
        padding_ms,
    );
    self.chunk_index = index + 1;

    let transcription = transcribe_padded(
        model,
        &samples,
        self.config.padding_secs,
        self.config.min_chunk_secs,
        chunk_start_secs,
        &self.options,
    )?;
    log::info!("  -> \"{}\"", transcription.text.trim());

    // Keep one copy for the session merge; hand the other to the caller.
    self.results.push(transcription.clone());
    Ok(transcription)
}
fn finish_inner(
&mut self,
model: &mut dyn SpeechModel,
) -> Result<TranscriptionResult, TranscribeError> {
if !self.pending.is_empty() {
let pending = std::mem::take(&mut self.pending);
if self.speech_buffer.is_empty() && self.speech_start_sample.is_none() {
self.speech_start_sample = Some(self.elapsed_samples);
}
self.speech_buffer.extend_from_slice(&pending);
self.elapsed_samples += pending.len();
}
if !self.speech_buffer.is_empty() {
log::info!(
"finish: transcribing remaining buffer ({:.2}s)",
self.speech_buffer.len() as f32 / SAMPLE_RATE
);
self.flush_speech_buffer(model)?;
}
log::info!("session complete: {} chunks transcribed", self.chunk_index);
Ok(merge_sequential_with_separator(
&self.results,
&self.config.merge_separator,
))
}
/// Returns the transcriber to its just-constructed state so it can be used
/// for a new session; configuration and options are kept.
fn reset_state(&mut self) {
    // Position tracking and flags.
    self.elapsed_samples = 0;
    self.chunk_index = 0;
    self.in_speech = false;
    self.speech_start_sample = None;

    // Buffered audio and accumulated results.
    self.pending.clear();
    self.speech_buffer.clear();
    self.results.clear();

    // The detector carries its own state across frames.
    self.vad.reset();
}
}
impl Transcriber for VadChunked {
/// Feeds more audio into the session and returns the results of every chunk
/// that was transcribed as a consequence.
///
/// Samples are classified one VAD frame at a time; a trailing partial frame
/// is kept in `pending` and prepended to the next call. Speech frames are
/// accumulated, and a chunk is transcribed when either
/// * speech turns into silence and at least `min_chunk_secs` are buffered
///   (shorter buffers are carried forward into the next speech region), or
/// * the buffer reaches `max_chunk_secs`, in which case it is force-split
///   (at a low-energy frame when `smart_split_search_secs` is set).
///
/// # Errors
///
/// Returns the first error reported by the VAD or by transcription. Results
/// of chunks transcribed earlier in the same call are still recorded for the
/// final merge but are not returned.
fn feed(
    &mut self,
    model: &mut dyn SpeechModel,
    samples: &[f32],
) -> Result<Vec<TranscriptionResult>, TranscribeError> {
    let frame_size = self.vad.frame_size();
    let mut new_results = Vec::new();

    // Prepend the partial frame left over from the previous call, if any.
    let combined;
    let to_process: &[f32] = if self.pending.is_empty() {
        samples
    } else {
        self.pending.extend_from_slice(samples);
        combined = std::mem::take(&mut self.pending);
        &combined
    };

    let mut frames = to_process.chunks_exact(frame_size);
    for frame in frames.by_ref() {
        let is_speech = self.vad.is_speech(frame)?;
        // Session position of this frame's first sample.
        let frame_start = self.elapsed_samples;
        self.elapsed_samples += frame_size;

        if is_speech {
            if !self.in_speech {
                // Speech onset: include the audio the VAD held back before
                // it committed to "speech".
                let prefill = self.vad.drain_prefill();
                if self.speech_start_sample.is_none() {
                    self.speech_start_sample = Some(frame_start.saturating_sub(prefill.len()));
                }
                self.speech_buffer.extend_from_slice(&prefill);
            } else if self.speech_start_sample.is_none() {
                // Speech is continuing after a forced split cleared the
                // recorded start. Record it now: otherwise a flush at the
                // next silence frame would be timestamped one frame late,
                // and a short remainder that gets carried forward would
                // have its start overwritten by the next speech onset.
                self.speech_start_sample =
                    Some(frame_start.saturating_sub(self.speech_buffer.len()));
            }
            self.speech_buffer.extend_from_slice(frame);
            self.in_speech = true;

            let chunk_secs = self.speech_buffer.len() as f32 / SAMPLE_RATE;
            if chunk_secs >= self.config.max_chunk_secs {
                log::info!(
                    "force-splitting at {:.2}s (max_chunk_secs={:.2})",
                    self.elapsed_samples as f32 / SAMPLE_RATE,
                    self.config.max_chunk_secs
                );
                let result = if let Some(search_secs) = self.config.smart_split_search_secs {
                    self.smart_split_buffer(model, search_secs)?
                } else {
                    self.flush_speech_buffer(model)?
                };
                new_results.push(result);
            }
        } else if self.in_speech {
            // Speech just ended.
            self.in_speech = false;
            if !self.speech_buffer.is_empty() {
                let chunk_secs = self.speech_buffer.len() as f32 / SAMPLE_RATE;
                if chunk_secs >= self.config.min_chunk_secs {
                    log::info!(
                        "speech->silence at {:.2}s, chunk buffered={:.2}s",
                        self.elapsed_samples as f32 / SAMPLE_RATE,
                        chunk_secs
                    );
                    new_results.push(self.flush_speech_buffer(model)?);
                } else {
                    // Too short to transcribe on its own: keep it buffered so
                    // it is merged with the next speech region.
                    log::debug!(
                        "carrying forward short chunk ({:.2}s < min {:.2}s)",
                        chunk_secs,
                        self.config.min_chunk_secs
                    );
                }
            }
        }
    }

    // Keep the trailing partial frame for the next call (or for `finish`).
    self.pending.extend_from_slice(frames.remainder());
    Ok(new_results)
}
/// Ends the session: transcribes any remaining audio, returns the merged
/// transcript, and resets the transcriber for reuse.
///
/// The reset happens whether or not the final transcription succeeded, so
/// the instance is usable again after an error.
fn finish(
    &mut self,
    model: &mut dyn SpeechModel,
) -> Result<TranscriptionResult, TranscribeError> {
    let outcome = self.finish_inner(model);
    self.reset_state();
    outcome
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::transcriber::test_helpers::{make_silence, make_speech, FailOnNthModel, MockModel};
    use crate::vad::{EnergyVad, SmoothedVad};

    // NOTE(review): the expected `chunk_<n>` strings suggest the mock model
    // reports the number of samples it received; confirm in `test_helpers`.

    /// Plain energy-threshold detector with 480-sample frames.
    fn energy_vad() -> Box<dyn Vad> {
        Box::new(EnergyVad::new(480, 0.01))
    }

    /// Energy detector wrapped in the smoothing detector used by the prefill tests.
    fn smoothed_vad() -> Box<dyn Vad> {
        Box::new(SmoothedVad::new(Box::new(EnergyVad::new(480, 0.01)), 5, 0, 2))
    }

    /// Default configuration with the minimum chunk length disabled.
    fn no_min_config() -> VadChunkedConfig {
        VadChunkedConfig {
            min_chunk_secs: 0.0,
            ..Default::default()
        }
    }

    /// Builds a transcriber with default transcription options.
    fn chunker(vad: Box<dyn Vad>, config: VadChunkedConfig) -> VadChunked {
        VadChunked::new(vad, config, TranscribeOptions::default())
    }

    #[test]
    fn vad_chunked_basic_speech_then_silence() {
        let mut t = chunker(energy_vad(), no_min_config());
        let mut model = MockModel;

        let during_speech = t.feed(&mut model, &make_speech(480, 10)).unwrap();
        assert!(during_speech.is_empty());

        let after_silence = t.feed(&mut model, &make_silence(480, 5)).unwrap();
        assert_eq!(after_silence.len(), 1);
        assert_eq!(after_silence[0].text, "chunk_4800");
    }

    #[test]
    fn vad_chunked_finish_transcribes_remainder() {
        let mut t = chunker(energy_vad(), no_min_config());
        let mut model = MockModel;

        let during_speech = t.feed(&mut model, &make_speech(480, 10)).unwrap();
        assert!(during_speech.is_empty());

        let merged = t.finish(&mut model).unwrap();
        assert_eq!(merged.text, "chunk_4800");
    }

    #[test]
    fn vad_chunked_max_duration_force_splits() {
        let config = VadChunkedConfig {
            max_chunk_secs: 0.06,
            ..no_min_config()
        };
        let mut t = chunker(energy_vad(), config);
        let mut model = MockModel;

        let chunks = t.feed(&mut model, &make_speech(480, 10)).unwrap();
        assert!(chunks.len() >= 4);
    }

    #[test]
    fn vad_chunked_short_speech_carries_forward() {
        let config = VadChunkedConfig {
            min_chunk_secs: 1.0,
            ..Default::default()
        };
        let mut t = chunker(energy_vad(), config);
        let mut model = MockModel;

        // One frame of speech is below the minimum, so nothing is emitted yet.
        t.feed(&mut model, &make_speech(480, 1)).unwrap();
        let after_short = t.feed(&mut model, &make_silence(480, 5)).unwrap();
        assert!(after_short.is_empty());

        // The short chunk is merged into the next speech region.
        t.feed(&mut model, &make_speech(480, 40)).unwrap();
        let after_long = t.feed(&mut model, &make_silence(480, 5)).unwrap();
        assert_eq!(after_long.len(), 1);
        assert_eq!(after_long[0].text, "chunk_19680");
    }

    #[test]
    fn vad_chunked_carry_forward_timestamp_correct() {
        let config = VadChunkedConfig {
            min_chunk_secs: 1.0,
            ..Default::default()
        };
        let mut t = chunker(energy_vad(), config);
        let mut model = MockModel;

        t.feed(&mut model, &make_speech(480, 1)).unwrap();
        t.feed(&mut model, &make_silence(480, 5)).unwrap();
        t.feed(&mut model, &make_speech(480, 40)).unwrap();
        let emitted = t.feed(&mut model, &make_silence(480, 5)).unwrap();

        assert_eq!(emitted.len(), 1);
        let segments = emitted[0].segments.as_ref().unwrap();
        assert!(
            segments[0].start < 0.01,
            "expected start near 0.0, got {}",
            segments[0].start
        );
    }

    #[test]
    fn vad_chunked_timestamps_adjusted() {
        let mut t = chunker(energy_vad(), no_min_config());
        let mut model = MockModel;

        t.feed(&mut model, &make_silence(480, 34)).unwrap();
        t.feed(&mut model, &make_speech(480, 10)).unwrap();
        let emitted = t.feed(&mut model, &make_silence(480, 5)).unwrap();

        assert_eq!(emitted.len(), 1);
        let segments = emitted[0].segments.as_ref().unwrap();
        assert!(segments[0].start > 0.9);
    }

    #[test]
    fn vad_chunked_timestamps_clamped_to_zero() {
        let config = VadChunkedConfig {
            padding_secs: 0.5,
            ..no_min_config()
        };
        let mut t = chunker(energy_vad(), config);
        let mut model = MockModel;

        t.feed(&mut model, &make_speech(480, 10)).unwrap();
        let emitted = t.feed(&mut model, &make_silence(480, 5)).unwrap();

        assert_eq!(emitted.len(), 1);
        let segments = emitted[0].segments.as_ref().unwrap();
        assert!(
            segments[0].start >= 0.0,
            "timestamp should not be negative, got {}",
            segments[0].start
        );
        assert!(
            segments[0].end >= 0.0,
            "timestamp should not be negative, got {}",
            segments[0].end
        );
    }

    #[test]
    fn vad_chunked_propagates_transcription_error() {
        let config = VadChunkedConfig {
            max_chunk_secs: 0.06,
            ..no_min_config()
        };
        let mut t = chunker(energy_vad(), config);
        let mut model = FailOnNthModel::new(2);

        let outcome = t.feed(&mut model, &make_speech(480, 10));
        assert!(outcome.is_err());
    }

    #[test]
    fn vad_chunked_transcribe_convenience() {
        let mut t = chunker(energy_vad(), no_min_config());
        let mut model = MockModel;

        let transcript = t.transcribe(&mut model, &make_speech(480, 10)).unwrap();
        assert!(!transcript.text.is_empty());
    }

    #[test]
    fn vad_chunked_smart_split_finds_low_energy() {
        let config = VadChunkedConfig {
            max_chunk_secs: 0.3,
            smart_split_search_secs: Some(0.15),
            ..no_min_config()
        };
        let mut t = chunker(energy_vad(), config);
        let mut model = MockModel;

        // Twelve loud frames, except frame 7 which is noticeably quieter.
        let audio: Vec<f32> = (0..12)
            .flat_map(|i| std::iter::repeat(if i == 7 { 0.1 } else { 1.0 }).take(480))
            .collect();

        let chunks = t.feed(&mut model, &audio).unwrap();
        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].text, "chunk_3840");
    }

    #[test]
    fn vad_chunked_smart_split_disabled_hard_cuts() {
        let config = VadChunkedConfig {
            max_chunk_secs: 0.06,
            smart_split_search_secs: None,
            ..no_min_config()
        };
        let mut t = chunker(energy_vad(), config);
        let mut model = MockModel;

        let chunks = t.feed(&mut model, &make_speech(480, 6)).unwrap();
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].text, "chunk_960");
    }

    #[test]
    fn vad_chunked_multiple_speech_regions() {
        let mut t = chunker(energy_vad(), no_min_config());
        let mut model = MockModel;

        let mut audio = Vec::new();
        audio.extend(make_speech(480, 5));
        audio.extend(make_silence(480, 5));
        audio.extend(make_speech(480, 8));
        audio.extend(make_silence(480, 5));

        let chunks = t.feed(&mut model, &audio).unwrap();
        assert_eq!(chunks.len(), 2);
    }

    #[test]
    fn vad_chunked_object_safe() {
        let mut transcriber: Box<dyn Transcriber> =
            Box::new(chunker(energy_vad(), no_min_config()));
        let mut model = MockModel;

        let transcript = transcriber
            .transcribe(&mut model, &make_speech(480, 10))
            .unwrap();
        assert!(!transcript.text.is_empty());
    }

    #[test]
    fn vad_chunked_prefill_captures_onset_audio() {
        let mut t = chunker(smoothed_vad(), no_min_config());
        let mut model = MockModel;

        t.feed(&mut model, &make_speech(480, 3)).unwrap();
        let emitted = t.feed(&mut model, &make_silence(480, 5)).unwrap();

        assert_eq!(emitted.len(), 1);
        assert_eq!(emitted[0].text, "chunk_1440");
    }

    #[test]
    fn vad_chunked_prefill_timestamp_accounts_for_prefill() {
        let mut t = chunker(smoothed_vad(), no_min_config());
        let mut model = MockModel;

        t.feed(&mut model, &make_silence(480, 3)).unwrap();
        t.feed(&mut model, &make_speech(480, 3)).unwrap();
        let emitted = t.feed(&mut model, &make_silence(480, 5)).unwrap();

        assert_eq!(emitted.len(), 1);
        let segments = emitted[0].segments.as_ref().unwrap();
        assert!(
            segments[0].start < 0.01,
            "expected start near 0.0, got {}",
            segments[0].start
        );
    }

    #[test]
    fn vad_chunked_pending_frame_alignment() {
        let mut t = chunker(energy_vad(), no_min_config());
        let mut model = MockModel;

        // Fed twice, so the input does not line up with the 480-sample frames.
        let misaligned = make_speech(1, 1000);
        t.feed(&mut model, &misaligned).unwrap();
        t.feed(&mut model, &misaligned).unwrap();

        let emitted = t.feed(&mut model, &make_silence(480, 5)).unwrap();
        assert_eq!(emitted.len(), 1);
        assert_eq!(emitted[0].text, "chunk_2400");
    }

    #[test]
    fn vad_chunked_reusable_after_error() {
        let mut t = chunker(energy_vad(), no_min_config());
        let speech = make_speech(480, 10);

        // Buffering succeeds; the failure only surfaces when `finish` transcribes.
        t.feed(&mut FailOnNthModel::new(1), &speech).unwrap();
        assert!(t.finish(&mut FailOnNthModel::new(1)).is_err());

        // The failed session must not leak into the next one.
        let mut model = MockModel;
        let transcript = t.transcribe(&mut model, &speech).unwrap();
        assert_eq!(transcript.text, "chunk_4800");
    }
}