use anyhow::{Context, Result};
use rubato::{FftFixedIn, Resampler};
/// Output sample rate, in Hz, that the resamplers in this file convert audio to.
///
/// Named after Whisper, which presumably consumes 16 kHz audio.
pub const WHISPER_SAMPLE_RATE: u32 = 16_000;
/// Streaming converter that turns incoming audio frames into mono samples at
/// [`WHISPER_SAMPLE_RATE`].
///
/// Input is accumulated internally until a full resampler chunk is available,
/// so callers may feed slices of any length.
pub struct FrameResampler {
    /// FFT resampler; `None` means samples pass through without rate conversion.
    resampler: Option<FftFixedIn<f32>>,
    /// Number of interleaved channels in the samples handed to `process`.
    channels: u16,
    /// Mono samples waiting for enough data to fill one resampler chunk.
    input_buffer: Vec<f32>,
    /// Number of input frames the resampler consumes per call (0 when
    /// `resampler` is `None`).
    chunk_size: usize,
}
impl FrameResampler {
pub fn new(source_rate: u32, channels: u16) -> Result<Self> {
if source_rate == WHISPER_SAMPLE_RATE && channels == 1 {
return Ok(Self {
resampler: None,
channels,
input_buffer: Vec::new(),
chunk_size: 0,
});
}
let resampler = FftFixedIn::<f32>::new(
source_rate as usize,
WHISPER_SAMPLE_RATE as usize,
1024, 2, 1, )
.context("Failed to create frame resampler")?;
let chunk_size = resampler.input_frames_max();
Ok(Self {
resampler: Some(resampler),
channels,
input_buffer: Vec::with_capacity(chunk_size * 2),
chunk_size,
})
}
pub fn process(&mut self, samples: &[f32]) -> Vec<f32> {
let Some(resampler) = &mut self.resampler else {
return samples.to_vec();
};
let mono_samples = if self.channels > 1 {
stereo_to_mono(samples, self.channels)
} else {
samples.to_vec()
};
self.input_buffer.extend_from_slice(&mono_samples);
let mut output = Vec::new();
while self.input_buffer.len() >= self.chunk_size {
let chunk: Vec<f32> = self.input_buffer.drain(..self.chunk_size).collect();
if let Ok(resampled) = resampler.process(&[chunk], None) {
output.extend_from_slice(&resampled[0]);
}
}
output
}
pub fn flush(&mut self) -> Vec<f32> {
let Some(resampler) = &mut self.resampler else {
return std::mem::take(&mut self.input_buffer);
};
if self.input_buffer.is_empty() {
return Vec::new();
}
let mut padded = std::mem::take(&mut self.input_buffer);
padded.resize(self.chunk_size, 0.0);
if let Ok(resampled) = resampler.process(&[padded], None) {
resampled[0].clone()
} else {
Vec::new()
}
}
}
/// Converts a complete buffer of interleaved audio to mono samples at
/// [`WHISPER_SAMPLE_RATE`].
///
/// Multi-channel input (`channels > 1`) is first downmixed to mono. If
/// `source_rate` already equals the target rate the mono samples are returned
/// as-is. Otherwise the audio is resampled chunk by chunk; the final partial
/// chunk is zero-padded to a full chunk, so the output ends with the resampled
/// padding.
///
/// # Errors
///
/// Returns an error if `source_rate` is zero, if the resampler cannot be
/// created, or if resampling a chunk fails.
#[cfg(feature = "local-transcription")]
pub fn resample_to_16k(samples: &[f32], source_rate: u32, channels: u16) -> Result<Vec<f32>> {
    // A zero rate is never valid; reject it here instead of handing it to the
    // resampler constructor.
    anyhow::ensure!(source_rate > 0, "Source sample rate must be non-zero");

    // Borrow mono input instead of copying it; only a downmix needs new storage.
    let mono: std::borrow::Cow<'_, [f32]> = if channels > 1 {
        std::borrow::Cow::Owned(stereo_to_mono(samples, channels))
    } else {
        std::borrow::Cow::Borrowed(samples)
    };
    if source_rate == WHISPER_SAMPLE_RATE {
        return Ok(mono.into_owned());
    }

    let mut resampler = FftFixedIn::<f32>::new(
        source_rate as usize,
        WHISPER_SAMPLE_RATE as usize,
        1024, // requested input chunk size, in frames
        2,    // sub-chunks per chunk
        1,    // channel count: audio is mono at this point
    )
    .context("Failed to create resampler")?;
    let chunk_size = resampler.input_frames_max();

    let mut output = Vec::new();

    // Full chunks are passed straight through as slices, with no copy.
    let mut full_chunks = mono.chunks_exact(chunk_size);
    for chunk in full_chunks.by_ref() {
        let result = resampler
            .process(&[chunk], None)
            .context("Resampling failed")?;
        if let Some(channel) = result.into_iter().next() {
            output.extend(channel);
        }
    }

    // Only the trailing partial chunk has to be copied so it can be padded.
    let tail = full_chunks.remainder();
    if !tail.is_empty() {
        let mut padded = tail.to_vec();
        padded.resize(chunk_size, 0.0);
        let result = resampler
            .process(&[padded], None)
            .context("Resampling failed")?;
        if let Some(channel) = result.into_iter().next() {
            output.extend(channel);
        }
    }

    Ok(output)
}
/// Downmixes interleaved multi-channel audio to mono by averaging the samples
/// of each frame.
///
/// Despite the name, any channel count is accepted. With `channels` of 0 or 1
/// there is nothing to mix and the input is returned unchanged. If `samples`
/// ends with an incomplete frame, that frame is averaged over the samples it
/// actually contains, so the last output sample is not attenuated.
fn stereo_to_mono(samples: &[f32], channels: u16) -> Vec<f32> {
    let frame_len = usize::from(channels);
    // `chunks(0)` would panic, and a single channel is already mono.
    if frame_len <= 1 {
        return samples.to_vec();
    }

    samples
        .chunks(frame_len)
        .map(|frame| frame.iter().sum::<f32>() / frame.len() as f32)
        .collect()
}