use crate::{SpeechModel, TranscribeError, TranscribeOptions, TranscriptionResult};
use super::merge::merge_sequential_with_separator;
use super::{rms_energy, transcribe_padded, Transcriber, SAMPLE_RATE};
pub struct EnergyAdaptiveConfig {
pub target_chunk_secs: f32,
pub search_window_secs: f32,
pub padding_secs: f32,
pub min_chunk_secs: f32,
pub frame_size: usize,
pub merge_separator: String,
}
impl Default for EnergyAdaptiveConfig {
fn default() -> Self {
Self {
target_chunk_secs: 30.0,
search_window_secs: 3.0,
padding_secs: 0.0,
min_chunk_secs: 0.0,
frame_size: 480,
merge_separator: " ".into(),
}
}
}
pub struct EnergyAdaptiveChunked {
config: EnergyAdaptiveConfig,
options: TranscribeOptions,
buffer: Vec<f32>,
elapsed_samples: usize,
chunk_index: usize,
results: Vec<TranscriptionResult>,
}
impl EnergyAdaptiveChunked {
pub fn new(config: EnergyAdaptiveConfig, options: TranscribeOptions) -> Self {
Self {
config,
options,
buffer: Vec::new(),
elapsed_samples: 0,
chunk_index: 0,
results: Vec::new(),
}
}
fn find_split_point(&self, target_samples: usize) -> usize {
let search_samples = (self.config.search_window_secs * SAMPLE_RATE) as usize;
let search_start = target_samples.saturating_sub(search_samples);
let search_end = (target_samples + search_samples).min(self.buffer.len());
let search_start = (search_start / self.config.frame_size) * self.config.frame_size;
let mut min_rms = f32::MAX;
let mut best_offset = target_samples.min(self.buffer.len());
let mut offset = search_start;
while offset + self.config.frame_size <= search_end {
let frame = &self.buffer[offset..offset + self.config.frame_size];
let rms = rms_energy(frame);
if rms < min_rms {
min_rms = rms;
best_offset = offset + self.config.frame_size;
}
offset += self.config.frame_size;
}
log::debug!(
"energy adaptive: target={:.2}s, split at {:.2}s (rms={:.4})",
target_samples as f32 / SAMPLE_RATE,
best_offset as f32 / SAMPLE_RATE,
min_rms,
);
best_offset
}
fn transcribe_chunk(
&mut self,
model: &mut dyn SpeechModel,
chunk: &[f32],
chunk_start_samples: usize,
) -> Result<TranscriptionResult, TranscribeError> {
let chunk_start_secs = chunk_start_samples as f32 / SAMPLE_RATE;
log::info!(
"chunk {}: start={:.2}s duration={:.2}s samples={} padding={:.0}ms",
self.chunk_index,
chunk_start_secs,
chunk.len() as f32 / SAMPLE_RATE,
chunk.len(),
self.config.padding_secs * 1000.0,
);
self.chunk_index += 1;
let result = transcribe_padded(
model,
chunk,
self.config.padding_secs,
self.config.min_chunk_secs,
chunk_start_secs,
&self.options,
)?;
log::info!(" -> \"{}\"", result.text.trim());
self.results.push(result.clone());
Ok(result)
}
fn finish_inner(
&mut self,
model: &mut dyn SpeechModel,
) -> Result<TranscriptionResult, TranscribeError> {
if !self.buffer.is_empty() {
let chunk = std::mem::take(&mut self.buffer);
let chunk_secs = chunk.len() as f32 / SAMPLE_RATE;
if chunk_secs >= self.config.min_chunk_secs {
let chunk_start = self.elapsed_samples - chunk.len();
self.transcribe_chunk(model, &chunk, chunk_start)?;
} else {
log::debug!(
"skipping short remainder ({:.2}s < min {:.2}s)",
chunk_secs,
self.config.min_chunk_secs
);
}
}
Ok(merge_sequential_with_separator(
&self.results,
&self.config.merge_separator,
))
}
fn reset_state(&mut self) {
self.buffer.clear();
self.results.clear();
self.elapsed_samples = 0;
self.chunk_index = 0;
}
}
impl Transcriber for EnergyAdaptiveChunked {
fn feed(
&mut self,
model: &mut dyn SpeechModel,
samples: &[f32],
) -> Result<Vec<TranscriptionResult>, TranscribeError> {
self.buffer.extend_from_slice(samples);
self.elapsed_samples += samples.len();
let target_samples = (self.config.target_chunk_secs * SAMPLE_RATE) as usize;
let search_samples = (self.config.search_window_secs * SAMPLE_RATE) as usize;
let min_buffer_for_split = target_samples + search_samples;
let mut new_results = Vec::new();
while self.buffer.len() >= min_buffer_for_split {
let split_at = self.find_split_point(target_samples);
let chunk: Vec<f32> = self.buffer.drain(..split_at).collect();
let chunk_start = self.elapsed_samples - self.buffer.len() - chunk.len();
new_results.push(self.transcribe_chunk(model, &chunk, chunk_start)?);
}
Ok(new_results)
}
fn finish(
&mut self,
model: &mut dyn SpeechModel,
) -> Result<TranscriptionResult, TranscribeError> {
let result = self.finish_inner(model);
self.reset_state();
result
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::transcriber::test_helpers::MockModel;
#[test]
fn energy_adaptive_splits_at_low_energy() {
let config = EnergyAdaptiveConfig {
target_chunk_secs: 3.0,
search_window_secs: 0.5,
..Default::default()
};
let mut t = EnergyAdaptiveChunked::new(config, TranscribeOptions::default());
let mut model = MockModel;
let mut audio = vec![1.0f32; 16000 * 5];
let quiet_start = (2.7 * SAMPLE_RATE) as usize;
let quiet_end = (2.8 * SAMPLE_RATE) as usize;
for s in &mut audio[quiet_start..quiet_end] {
*s = 0.01;
}
let results = t.feed(&mut model, &audio).unwrap();
assert_eq!(results.len(), 1);
let chunk_samples: usize = results[0]
.text
.strip_prefix("chunk_")
.unwrap()
.parse()
.unwrap();
let chunk_secs = chunk_samples as f32 / SAMPLE_RATE;
assert!(
chunk_secs >= 2.5 && chunk_secs <= 3.5,
"split at {chunk_secs:.2}s, expected within search window of 3.0s"
);
assert!(
chunk_secs < 3.0,
"split at {chunk_secs:.2}s, should prefer quiet region before target"
);
}
#[test]
fn energy_adaptive_uniform_audio_splits_near_target() {
let config = EnergyAdaptiveConfig {
target_chunk_secs: 1.0,
search_window_secs: 0.3,
..Default::default()
};
let mut t = EnergyAdaptiveChunked::new(config, TranscribeOptions::default());
let mut model = MockModel;
let audio = vec![0.5f32; 16000 * 3];
let results = t.feed(&mut model, &audio).unwrap();
assert!(!results.is_empty());
}
#[test]
fn energy_adaptive_remainder_in_finish() {
let config = EnergyAdaptiveConfig {
target_chunk_secs: 3.0,
search_window_secs: 0.5,
..Default::default()
};
let mut t = EnergyAdaptiveChunked::new(config, TranscribeOptions::default());
let mut model = MockModel;
let audio = vec![0.5f32; 16000 * 2];
let results = t.feed(&mut model, &audio).unwrap();
assert!(results.is_empty());
let final_result = t.finish(&mut model).unwrap();
assert_eq!(final_result.text, "chunk_32000");
}
#[test]
fn energy_adaptive_min_chunk_skips_short_remainder() {
let config = EnergyAdaptiveConfig {
target_chunk_secs: 3.0,
search_window_secs: 0.5,
min_chunk_secs: 3.0, ..Default::default()
};
let mut t = EnergyAdaptiveChunked::new(config, TranscribeOptions::default());
let mut model = MockModel;
let audio = vec![0.5f32; 16000 * 2];
t.feed(&mut model, &audio).unwrap();
let final_result = t.finish(&mut model).unwrap();
assert_eq!(final_result.text, ""); }
#[test]
fn energy_adaptive_timestamps_correct() {
let config = EnergyAdaptiveConfig {
target_chunk_secs: 1.0,
search_window_secs: 0.0,
..Default::default()
};
let mut t = EnergyAdaptiveChunked::new(config, TranscribeOptions::default());
let mut model = MockModel;
let audio = vec![0.5f32; 16000 * 3];
let results = t.feed(&mut model, &audio).unwrap();
assert_eq!(results.len(), 3);
if results.len() >= 2 {
let seg0 = &results[0].segments.as_ref().unwrap()[0];
let seg1 = &results[1].segments.as_ref().unwrap()[0];
assert!((seg0.start - 0.0).abs() < 0.01);
assert!((seg1.start - 1.0).abs() < 0.01);
}
}
#[test]
fn energy_adaptive_timestamps_clamped_to_zero() {
let config = EnergyAdaptiveConfig {
target_chunk_secs: 1.0,
search_window_secs: 0.0,
padding_secs: 0.5,
..Default::default()
};
let mut t = EnergyAdaptiveChunked::new(config, TranscribeOptions::default());
let mut model = MockModel;
let audio = vec![0.5f32; 16000 * 2];
let results = t.feed(&mut model, &audio).unwrap();
assert!(!results.is_empty());
let segs = results[0].segments.as_ref().unwrap();
assert!(
segs[0].start >= 0.0,
"timestamp should not be negative, got {}",
segs[0].start
);
assert!(
segs[0].end >= 0.0,
"timestamp should not be negative, got {}",
segs[0].end
);
}
#[test]
fn energy_adaptive_transcribe_convenience() {
let config = EnergyAdaptiveConfig {
target_chunk_secs: 1.0,
search_window_secs: 0.3,
..Default::default()
};
let mut t = EnergyAdaptiveChunked::new(config, TranscribeOptions::default());
let mut model = MockModel;
let audio = vec![0.5f32; 16000 * 5];
let result = t.transcribe(&mut model, &audio).unwrap();
assert!(!result.text.is_empty());
}
#[test]
fn energy_adaptive_empty_input() {
let config = EnergyAdaptiveConfig {
target_chunk_secs: 1.0,
search_window_secs: 0.3,
..Default::default()
};
let mut t = EnergyAdaptiveChunked::new(config, TranscribeOptions::default());
let mut model = MockModel;
let result = t.transcribe(&mut model, &[]).unwrap();
assert_eq!(result.text, "");
}
#[test]
fn energy_adaptive_object_safe() {
let config = EnergyAdaptiveConfig {
target_chunk_secs: 1.0,
search_window_secs: 0.3,
..Default::default()
};
let mut transcriber: Box<dyn Transcriber> = Box::new(EnergyAdaptiveChunked::new(
config,
TranscribeOptions::default(),
));
let mut model = MockModel;
let audio = vec![0.5f32; 16000 * 5];
let result = transcriber.transcribe(&mut model, &audio).unwrap();
assert!(!result.text.is_empty());
}
#[test]
fn energy_adaptive_reusable_after_error() {
use crate::transcriber::test_helpers::FailOnNthModel;
let config = EnergyAdaptiveConfig {
target_chunk_secs: 3.0,
search_window_secs: 0.5,
..Default::default()
};
let mut t = EnergyAdaptiveChunked::new(config, TranscribeOptions::default());
let audio = vec![0.5f32; 16000 * 2];
t.feed(&mut FailOnNthModel::new(99), &audio).unwrap();
assert!(t.finish(&mut FailOnNthModel::new(1)).is_err());
let mut model = MockModel;
let result = t.transcribe(&mut model, &audio).unwrap();
assert!(!result.text.is_empty());
}
}