#![cfg(feature = "stt-candle")]
use std::sync::OnceLock;
use candle_transformers::models::whisper as m;
use super::SttError;
/// Largest number of PCM samples accepted by [`compute_log_mel_spectrogram`].
///
/// Mirrors the Whisper module's `N_SAMPLES` constant; the rejection message
/// emitted for longer inputs describes it as Whisper's 30-second window at
/// 16 kHz.
pub(crate) const MAX_SAMPLES: usize = m::N_SAMPLES;
/// Raw contents of `melfilters.bytes`, embedded into the binary at compile
/// time (the path is resolved relative to this source file).
///
/// [`mel_filters_80`] decodes these bytes as consecutive little-endian `f32`
/// values.
const MEL_FILTERS_80_BYTES: &[u8] = include_bytes!("melfilters.bytes");
/// The only mel-bin count [`compute_log_mel_spectrogram`] accepts; any other
/// requested value is rejected with a decode error.
const NUM_MEL_BINS_DEFAULT: usize = 80;
fn mel_filters_80() -> &'static [f32] {
static CACHE: OnceLock<Vec<f32>> = OnceLock::new();
CACHE.get_or_init(|| {
let mut out = vec![0f32; MEL_FILTERS_80_BYTES.len() / 4];
for (i, chunk) in MEL_FILTERS_80_BYTES.chunks_exact(4).enumerate() {
out[i] = f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
}
out
})
}
/// Turns a clip of PCM samples into a mel spectrogram buffer by delegating to
/// the Whisper module's `pcm_to_mel`, using the embedded 80-bin filterbank.
///
/// `samples` is presumably mono audio at 16 kHz (the rate quoted in the
/// over-length error text) — confirm against callers.
///
/// # Errors
///
/// The input is validated in this order, and the first failing check wins:
///
/// 1. [`SttError::EmptyAudio`] if `samples` is empty.
/// 2. [`SttError::Decode`] if `samples` holds more than [`MAX_SAMPLES`]
///    values.
/// 3. [`SttError::Decode`] if `num_mel_bins` differs from
///    [`NUM_MEL_BINS_DEFAULT`], since only the 80-bin filterbank is embedded.
pub(crate) fn compute_log_mel_spectrogram(
    samples: &[f32],
    num_mel_bins: usize,
) -> Result<Vec<f32>, SttError> {
    // Length guards: nothing to transcribe, or more than one window's worth.
    match samples.len() {
        0 => return Err(SttError::EmptyAudio),
        sample_count if sample_count > MAX_SAMPLES => {
            let detail = format!(
                "audio length {} samples exceeds Whisper's 30-second window ({} samples \
                 @ 16 kHz) — split the clip before transcribing or wait for the \
                 long-form Phase 92.x follow-up",
                sample_count,
                MAX_SAMPLES,
            );
            return Err(SttError::Decode(detail));
        }
        _ => {}
    }

    // Only one filterbank asset is bundled, so any other bin count is refused
    // rather than silently producing a mismatched spectrogram.
    if num_mel_bins != NUM_MEL_BINS_DEFAULT {
        let detail = format!(
            "Phase 91 v1 only ships the 80-bin mel filterbank; the model \
             requested {num_mel_bins} bins. Use a `tiny`/`base`/`small` Whisper \
             checkpoint or wait for the 128-bin asset (Phase 91 follow-up)"
        );
        return Err(SttError::Decode(detail));
    }

    // Start from the stub configuration and overwrite the bin count with the
    // caller's (already validated) value.
    let mut cfg = whisper_tiny_config_stub();
    cfg.num_mel_bins = num_mel_bins;

    let filters = mel_filters_80();
    Ok(m::audio::pcm_to_mel(&cfg, samples, filters))
}
/// Builds a placeholder Whisper configuration with hard-coded dimensions.
///
/// [`compute_log_mel_spectrogram`] uses this as the base value and overrides
/// only `num_mel_bins`. The numbers presumably match the Whisper `tiny`
/// checkpoint, as the function name suggests — verify against the upstream
/// model config before relying on them for anything beyond mel extraction.
fn whisper_tiny_config_stub() -> m::Config {
    // The encoder and decoder use the same head and layer counts in this stub.
    const ATTENTION_HEADS: usize = 6;
    const LAYERS: usize = 4;

    m::Config {
        // Model-wide dimensions.
        vocab_size: 51_865,
        d_model: 384,
        num_mel_bins: NUM_MEL_BINS_DEFAULT,
        // Position limits.
        max_source_positions: 1_500,
        max_target_positions: 448,
        // Encoder stack.
        encoder_layers: LAYERS,
        encoder_attention_heads: ATTENTION_HEADS,
        // Decoder stack.
        decoder_layers: LAYERS,
        decoder_attention_heads: ATTENTION_HEADS,
        // No tokens are suppressed in the stub.
        suppress_tokens: Vec::new(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Sample count the tests treat as one second of audio.
    const ONE_SECOND: usize = 16_000;

    /// Allocates `len` samples of digital silence.
    fn silence(len: usize) -> Vec<f32> {
        vec![0.0_f32; len]
    }

    /// An empty slice must map to the dedicated `EmptyAudio` variant.
    #[test]
    fn empty_input_returns_empty_audio() {
        assert!(matches!(
            compute_log_mel_spectrogram(&[], 80),
            Err(SttError::EmptyAudio)
        ));
    }

    /// One sample past the limit must be rejected with actionable guidance.
    #[test]
    fn over_30s_clip_rejects_with_decode_error() {
        let too_long = silence(MAX_SAMPLES + 1);
        let msg = compute_log_mel_spectrogram(&too_long, 80)
            .unwrap_err()
            .to_string();
        assert!(msg.contains("30-second window"), "got: {msg}");
        assert!(msg.contains("split"), "got: {msg}");
    }

    /// Requesting a bin count other than 80 must fail and echo the request.
    #[test]
    fn non_80_bin_request_rejects() {
        let clip = silence(ONE_SECOND);
        let msg = compute_log_mel_spectrogram(&clip, 128)
            .unwrap_err()
            .to_string();
        assert!(msg.contains("80-bin"), "got: {msg}");
        assert!(msg.contains("128"), "got: {msg}");
    }

    /// A valid short clip yields a non-empty buffer made of whole 80-bin rows.
    #[test]
    fn short_silence_produces_a_buffer() {
        let clip = silence(ONE_SECOND);
        let mel = compute_log_mel_spectrogram(&clip, 80).expect("1s silence is valid input");
        assert!(!mel.is_empty(), "mel buffer must be non-empty");
        assert_eq!(mel.len() % 80, 0, "mel length must align to 80 bins");
    }

    /// The embedded asset must decode to the expected number of coefficients.
    #[test]
    fn mel_filters_decode_to_expected_count() {
        let bank = mel_filters_80();
        assert_eq!(bank.len(), 80 * 201, "filterbank shape must be 80×201");
        assert!(bank[0].is_finite());
    }
}