use crate::whisper::config::WhisperConfig;
use crate::whisper::model::{WhisperAudioEncoder, WhisperForConditionalGeneration, WhisperModel};
use crate::whisper::tasks::{
SpeechRecognitionTask, WhisperDecoderWrapper, WhisperError, WhisperForAudioClassification,
WhisperTimestamp,
};
use trustformers_core::{tensor::Tensor, traits::Config};
/// Builds a deliberately small `WhisperConfig` shared by the tests in this file.
///
/// Encoder and decoder both use 2 layers, 4 attention heads and a 256-wide FFN on top of a
/// 64-dimensional model; every dropout rate is zero and embedding scaling is disabled.
fn tiny_test_config() -> WhisperConfig {
    WhisperConfig {
        model_type: String::from("whisper"),
        // Shared dimensions.
        d_model: 64,
        vocab_size: 512,
        scale_embedding: false,
        // Audio encoder.
        num_mel_bins: 80,
        max_source_positions: 32,
        encoder_layers: 2,
        encoder_attention_heads: 4,
        encoder_ffn_dim: 256,
        // Text decoder.
        max_target_positions: 16,
        decoder_layers: 2,
        decoder_attention_heads: 4,
        decoder_ffn_dim: 256,
        // Regularisation is switched off entirely.
        dropout: 0.0,
        attention_dropout: 0.0,
        activation_dropout: 0.0,
    }
}
/// Creates a `[batch, mel_bins, time]` tensor in which every element is `0.1`.
///
/// Panics with "mel tensor" if `Tensor::from_vec` rejects the data/shape pair.
fn make_mel(batch: usize, mel_bins: usize, time: usize) -> Tensor {
    let element_count = batch * mel_bins * time;
    let samples = vec![0.1f32; element_count];
    let dims = [batch, mel_bins, time];
    Tensor::from_vec(samples, &dims).expect("mel tensor")
}
/// `WhisperConfig::default()` must expose the expected base dimensions (d_model 512,
/// 6 encoder / 6 decoder layers, 80 mel bins, vocab 51865, model type "whisper") and
/// pass `validate()`.
#[test]
fn test_config_default() {
    let defaults = WhisperConfig::default();
    assert_eq!(defaults.d_model, 512);
    assert_eq!(defaults.encoder_layers, 6);
    assert_eq!(defaults.decoder_layers, 6);
    assert_eq!(defaults.num_mel_bins, 80);
    assert_eq!(defaults.vocab_size, 51865);
    assert_eq!(defaults.model_type, "whisper");
    defaults.validate().expect("default config should be valid");
}
/// The `whisper_tiny` preset must report d_model 384, 4 encoder / 4 decoder layers,
/// 6 attention heads on each side, an encoder FFN width of 1536, vocab 51865, and validate.
#[test]
fn test_whisper_tiny_preset() {
    let tiny = WhisperConfig::whisper_tiny();
    assert_eq!(tiny.d_model, 384);
    assert_eq!(tiny.encoder_layers, 4);
    assert_eq!(tiny.decoder_layers, 4);
    assert_eq!(tiny.encoder_attention_heads, 6);
    assert_eq!(tiny.decoder_attention_heads, 6);
    assert_eq!(tiny.encoder_ffn_dim, 1536);
    assert_eq!(tiny.vocab_size, 51865);
    tiny.validate().expect("whisper_tiny config should be valid");
}
/// The `whisper_base` preset must report d_model 512, 6 encoder layers and 8 encoder
/// attention heads, and must pass `validate()`.
#[test]
fn test_whisper_base_preset() {
    let base = WhisperConfig::whisper_base();
    assert_eq!(base.d_model, 512);
    assert_eq!(base.encoder_layers, 6);
    assert_eq!(base.encoder_attention_heads, 8);
    base.validate().expect("whisper_base config should be valid");
}
/// Checks the dimensions of the large preset: d_model 1280, 32 encoder / 32 decoder layers,
/// 20 encoder attention heads, encoder FFN width 5120, vocab 51865, plus `validate()`.
///
/// NOTE(review): the test name says "v3", but the body constructs and validates
/// `WhisperConfig::whisper_large_v2()` — confirm which preset is intended.
#[test]
fn test_whisper_large_v3_preset() {
    let large = WhisperConfig::whisper_large_v2();
    assert_eq!(large.d_model, 1280);
    assert_eq!(large.encoder_layers, 32);
    assert_eq!(large.encoder_attention_heads, 20);
    assert_eq!(large.encoder_ffn_dim, 5120);
    assert_eq!(large.decoder_layers, 32);
    assert_eq!(large.vocab_size, 51865);
    large.validate().expect("whisper_large_v2 config should be valid");
}
/// Runs `WhisperForConditionalGeneration::forward` on a 1x80x20 mel input with three decoder
/// tokens and, when the call succeeds, checks the first three logits dimensions are
/// `[1, 3, vocab_size]`.
///
/// A forward-pass error is tolerated: the shape checks are skipped in that case.
#[test]
fn test_forward_pass_shape() {
    let cfg = tiny_test_config();
    let generator = WhisperForConditionalGeneration::new(cfg.clone()).expect("model creation");
    let features = make_mel(1, 80, 20);
    let prompt: Vec<u32> = vec![1, 2, 3];
    if let Ok(logits) = generator.forward(&features, &prompt) {
        let dims = logits.shape().to_vec();
        assert_eq!(dims[0], 1, "batch");
        assert_eq!(dims[1], 3, "seq_len");
        assert_eq!(dims[2], cfg.vocab_size, "vocab_size");
    }
}
/// A `SpeechRecognitionTask` must expose the config it was built from (`d_model` and
/// `vocab_size` are compared), and a successful forward pass must yield logits whose third
/// dimension equals `vocab_size`.
///
/// A forward-pass error is tolerated: the logits check is skipped in that case.
#[test]
fn test_speech_recognition_task_creation() {
    let cfg = tiny_test_config();
    let asr = SpeechRecognitionTask::new(cfg.clone()).expect("task creation");
    assert_eq!(asr.config().d_model, cfg.d_model);
    assert_eq!(asr.config().vocab_size, cfg.vocab_size);
    let features = make_mel(1, 80, 20);
    if let Ok(logits) = asr.forward(&features, &[1, 2]) {
        let dims = logits.shape().to_vec();
        assert_eq!(dims[2], cfg.vocab_size);
    }
}
/// Greedy transcription of a mel tensor with zero time frames (shape `[1, 80, 0]`) must fail
/// with `WhisperError::EmptyInput`.
#[test]
fn test_transcribe_greedy_empty_input() {
    let asr = SpeechRecognitionTask::new(tiny_test_config()).expect("task creation");
    let no_frames = Tensor::from_vec(vec![], &[1, 80, 0]).expect("empty mel");
    match asr.transcribe_greedy(&no_frames, 1, 10) {
        Err(WhisperError::EmptyInput) => {},
        other => panic!("expected EmptyInput, got {:?}", other),
    }
}
/// Smoke test: greedy transcription of a 1x80x20 mel input must not panic.
///
/// Both `Ok` and `Err` outcomes are accepted; the returned value is not inspected.
#[test]
fn test_transcribe_greedy_valid_input() {
    let asr = SpeechRecognitionTask::new(tiny_test_config()).expect("task creation");
    let features = make_mel(1, 80, 20);
    // The outcome is discarded on purpose: only the absence of a panic is checked.
    let _ = asr.transcribe_greedy(&features, 1, 5);
}
/// Checks the result contract of `SpeechRecognitionTask::detect_language` on a 1x80x20 mel
/// input.
///
/// When detection succeeds the result must contain exactly five `(code, probability)`
/// entries, every probability must lie in `[0, 1]`, and the probabilities must be
/// non-increasing from one entry to the next. A detection error is tolerated, in which case
/// nothing is asserted.
#[test]
fn test_detect_language_logits() {
    let config = tiny_test_config();
    let task = SpeechRecognitionTask::new(config).expect("task creation");
    let mel = make_mel(1, 80, 20);
    if let Ok(lang_probs) = task.detect_language(&mel) {
        assert_eq!(lang_probs.len(), 5, "detect_language should return top-5");
        for (code, prob) in &lang_probs {
            assert!(
                (0.0..=1.0).contains(prob),
                "prob out of range for {code}: {prob}"
            );
        }
        // Compare each adjacent pair. Checking every entry only against the first one would
        // accept an unsorted tail such as [0.5, 0.1, 0.3].
        for pair in lang_probs.windows(2) {
            assert!(
                pair[0].1 >= pair[1].1,
                "language probabilities should be sorted descending"
            );
        }
    }
}
/// Exercises `WhisperTimestamp`: field access after `new`, `duration_ms()` as end minus
/// start, and a `Display` rendering that contains "0ms", "500ms" and the segment text.
#[test]
fn test_whisper_timestamps_struct() {
    let first = WhisperTimestamp::new(0.0, 500.0, "hello world");
    assert_eq!(first.start_ms, 0.0);
    assert_eq!(first.end_ms, 500.0);
    assert_eq!(first.text, "hello world");
    assert_eq!(first.duration_ms(), 500.0);

    // A second segment that does not start at zero.
    let second = WhisperTimestamp::new(500.0, 1200.0, "foo bar");
    assert_eq!(second.duration_ms(), 700.0);

    let rendered = first.to_string();
    assert!(rendered.contains("0ms"), "display should show start");
    assert!(rendered.contains("500ms"), "display should show end");
    assert!(rendered.contains("hello world"), "display should show text");
}
/// The tiny, base, small and medium presets must all use 80 mel bins and 1500 source
/// positions.
#[test]
fn test_mel_filterbank_config() {
    let presets = [
        WhisperConfig::whisper_tiny(),
        WhisperConfig::whisper_base(),
        WhisperConfig::whisper_small(),
        WhisperConfig::whisper_medium(),
    ];
    presets.iter().for_each(|preset| {
        assert_eq!(
            preset.num_mel_bins, 80,
            "all presets should use 80 mel bins"
        );
        assert_eq!(
            preset.max_source_positions, 1500,
            "1500 source positions = 30s / 20ms per frame"
        );
    });
}
/// Feeds a 1x80x20 mel input through `WhisperAudioEncoder` and, on success, expects an
/// output of `[1, 10, d_model]`, i.e. the 20 input frames halved.
///
/// An encoder error is tolerated: the shape checks are skipped in that case.
#[test]
fn test_encoder_output_shape() {
    let cfg = tiny_test_config();
    let audio_encoder = WhisperAudioEncoder::new(&cfg).expect("encoder creation");
    let features = make_mel(1, 80, 20);
    if let Ok(hidden) = audio_encoder.forward(&features) {
        let dims = hidden.shape().to_vec();
        assert_eq!(dims[0], 1, "batch size");
        assert_eq!(dims[1], 10, "expected T/2 after stride-2 conv2");
        assert_eq!(dims[2], cfg.d_model, "d_model");
    }
}
/// Runs `WhisperModel::forward` with four decoder tokens and, on success, expects hidden
/// states of `[1, 4, d_model]`.
///
/// A forward-pass error is tolerated: the shape checks are skipped in that case.
#[test]
fn test_decoder_forward_shape() {
    let cfg = tiny_test_config();
    let base_model = WhisperModel::new(cfg.clone()).expect("model creation");
    let features = make_mel(1, 80, 20);
    let prompt: Vec<u32> = vec![1, 2, 3, 4];
    if let Ok(hidden) = base_model.forward(&features, &prompt) {
        let dims = hidden.shape().to_vec();
        assert_eq!(dims[0], 1, "batch");
        assert_eq!(dims[1], 4, "seq_len = number of decoder tokens");
        assert_eq!(dims[2], cfg.d_model, "d_model");
    }
}
/// A `WhisperForAudioClassification` built for 10 labels must report that label count and,
/// when its forward pass succeeds, return exactly one logit per label.
///
/// A forward-pass error is tolerated: the length check is skipped in that case.
#[test]
fn test_audio_classification_task() {
    let num_labels = 10;
    let head = WhisperForAudioClassification::new(tiny_test_config(), num_labels)
        .expect("classifier creation");
    assert_eq!(head.num_labels(), num_labels);
    let features = make_mel(1, 80, 20);
    if let Ok(logits) = head.forward(&features) {
        assert_eq!(
            logits.len(),
            num_labels,
            "should produce one logit per label"
        );
    }
}
/// Each `WhisperError` variant must render a `Display` message containing an identifying
/// fragment (or its payload), and the type must be usable as `Box<dyn std::error::Error>`.
#[test]
fn test_whisper_error_display() {
    // Unit variants: the message must mention what went wrong.
    let e1 = WhisperError::EmptyInput;
    assert!(e1.to_string().contains("empty"));
    let e2 = WhisperError::InvalidBeamSize;
    assert!(e2.to_string().contains("beam_size"));

    // Payload-carrying variants: the payload must appear in the message.
    let e3 = WhisperError::ForwardError(String::from("NaN"));
    assert!(e3.to_string().contains("NaN"));
    let e4 = WhisperError::LanguageDetectionFailed;
    assert!(e4.to_string().contains("language detection"));
    let e5 = WhisperError::DecodingFailed(String::from("stalled"));
    assert!(e5.to_string().contains("stalled"));

    // Compile-time check that the enum implements `std::error::Error`.
    let _as_std_error: Box<dyn std::error::Error> = Box::new(WhisperError::EmptyInput);
}
/// Beam search called with a beam size of zero must fail with
/// `WhisperError::InvalidBeamSize`.
#[test]
fn test_transcribe_beam_invalid_beam_size() {
    let asr = SpeechRecognitionTask::new(tiny_test_config()).expect("task creation");
    let features = make_mel(1, 80, 20);
    match asr.transcribe_beam(&features, 1, 0, 5) {
        Err(WhisperError::InvalidBeamSize) => {},
        _ => panic!("beam_size=0 should return InvalidBeamSize"),
    }
}
/// Beam search called with a beam size of 3 must, when it succeeds, return between one and
/// three hypotheses.
///
/// A transcription error is tolerated: the count checks are skipped in that case.
#[test]
fn test_transcribe_beam_valid() {
    let asr = SpeechRecognitionTask::new(tiny_test_config()).expect("task creation");
    let features = make_mel(1, 80, 20);
    if let Ok(hypotheses) = asr.transcribe_beam(&features, 1, 3, 5) {
        assert!(
            !hypotheses.is_empty(),
            "should produce at least one hypothesis"
        );
        assert!(
            hypotheses.len() <= 3,
            "should return at most beam_size hypotheses"
        );
    }
}
/// `WhisperDecoderWrapper` must expose its config, and decoding two tokens against encoder
/// hidden states must yield logits of `[1, 2, vocab_size]`.
///
/// The encoder hidden states come from a separately constructed
/// `WhisperForConditionalGeneration`. Errors from either the encoder or the decoder are
/// tolerated: the shape checks are skipped in that case.
#[test]
fn test_decoder_wrapper_forward() {
    let cfg = tiny_test_config();
    let decoder = WhisperDecoderWrapper::new(cfg.clone()).expect("wrapper creation");
    assert_eq!(decoder.config().d_model, cfg.d_model);
    let features = make_mel(1, 80, 20);
    let encoder_source =
        WhisperForConditionalGeneration::new(cfg.clone()).expect("model for encoder");
    if let Ok(encoder_hs) = encoder_source.model.encoder.forward(&features) {
        let prompt: Vec<u32> = vec![1, 2];
        if let Ok(logits) = decoder.decode(&encoder_hs, &prompt) {
            let dims = logits.shape().to_vec();
            assert_eq!(dims[0], 1);
            assert_eq!(dims[1], 2);
            assert_eq!(dims[2], cfg.vocab_size);
        }
    }
}
/// Transcribing 60 mel frames in chunks of 30 must, on success, yield two segments covering
/// 0–600 ms and 600–1200 ms.
///
/// A transcription error is tolerated: the segment checks are skipped in that case.
#[test]
fn test_transcribe_with_timestamps() {
    let asr = SpeechRecognitionTask::new(tiny_test_config()).expect("task creation");
    let features = make_mel(1, 80, 60);
    if let Ok(segments) = asr.transcribe_with_timestamps(&features, 1, 30, 3) {
        assert_eq!(segments.len(), 2, "60 frames / 30 per chunk = 2 segments");
        // First chunk.
        assert_eq!(segments[0].start_ms, 0.0);
        assert_eq!(segments[0].end_ms, 600.0);
        // Second chunk starts exactly where the first one ended.
        assert_eq!(segments[1].start_ms, 600.0);
        assert_eq!(segments[1].end_ms, 1200.0);
    }
}
/// The `whisper_tiny` preset must report d_model 384, 4 encoder / 4 decoder layers,
/// 6 encoder attention heads and vocab 51865, and must pass `validate()`.
#[test]
fn test_whisper_config_tiny() {
    let tiny = WhisperConfig::whisper_tiny();
    assert_eq!(tiny.d_model, 384);
    assert_eq!(tiny.encoder_layers, 4);
    assert_eq!(tiny.decoder_layers, 4);
    assert_eq!(tiny.encoder_attention_heads, 6);
    assert_eq!(tiny.vocab_size, 51865);
    tiny.validate().expect("whisper_tiny config should be valid");
}
/// The `whisper_base` preset must report d_model 512, 6 encoder / 6 decoder layers and
/// 8 encoder attention heads, and must pass `validate()`.
#[test]
fn test_whisper_config_base() {
    let base = WhisperConfig::whisper_base();
    assert_eq!(base.d_model, 512);
    assert_eq!(base.encoder_layers, 6);
    assert_eq!(base.decoder_layers, 6);
    assert_eq!(base.encoder_attention_heads, 8);
    base.validate().expect("whisper_base config should be valid");
}
/// Every preset constructor (tiny, base, small, medium, large_v2, tiny_en) must produce a
/// config that passes `validate()`. The `tiny_en` preset must have vocab 50257 and the
/// `tiny` preset vocab 51865.
#[test]
fn test_whisper_generate_config() {
    // Validates one preset, panicking with its short label on failure.
    let must_validate = |preset: WhisperConfig, label: &str| {
        preset.validate().expect(label);
    };
    must_validate(WhisperConfig::whisper_tiny(), "tiny");
    must_validate(WhisperConfig::whisper_base(), "base");
    must_validate(WhisperConfig::whisper_small(), "small");
    must_validate(WhisperConfig::whisper_medium(), "medium");
    must_validate(WhisperConfig::whisper_large_v2(), "large_v2");
    must_validate(WhisperConfig::whisper_tiny_en(), "tiny_en");

    let english_only = WhisperConfig::whisper_tiny_en();
    assert_eq!(english_only.vocab_size, 50257);
    let multilingual = WhisperConfig::whisper_tiny();
    assert_eq!(multilingual.vocab_size, 51865);
}
/// Feeds an all-zero `[1, 80, 20]` mel tensor through `WhisperAudioEncoder` and, on success,
/// expects an output of `[1, 10, d_model]`.
///
/// An encoder error is tolerated: the shape checks are skipped in that case.
#[test]
fn test_whisper_audio_encoder_output_shape() {
    let cfg = tiny_test_config();
    let audio_encoder = WhisperAudioEncoder::new(&cfg).expect("encoder creation");
    let (batch, mel_bins, frames) = (1usize, 80usize, 20usize);
    let silence = vec![0.0f32; batch * mel_bins * frames];
    let features = Tensor::from_vec(silence, &[batch, mel_bins, frames]).expect("mel tensor");
    if let Ok(hidden) = audio_encoder.forward(&features) {
        let dims = hidden.shape().to_vec();
        assert_eq!(dims[0], batch);
        assert_eq!(dims[1], 10, "expected T/2 after stride-2 conv2");
        assert_eq!(dims[2], cfg.d_model);
    }
}
/// With 40 input frames the encoder output must, on success, have 20 positions along
/// dimension 1.
///
/// An encoder error is tolerated: the check is skipped in that case.
#[test]
fn test_whisper_conv_stem_stride() {
    let cfg = tiny_test_config();
    let audio_encoder = WhisperAudioEncoder::new(&cfg).expect("encoder creation");
    let silence = vec![0.0f32; 80 * 40];
    let features = Tensor::from_vec(silence, &[1, 80, 40]).expect("mel tensor");
    if let Ok(hidden) = audio_encoder.forward(&features) {
        let dims = hidden.shape().to_vec();
        assert_eq!(dims[1], 20, "40 frames -> 20 after stride-2 conv");
    }
}
/// Runs `WhisperModel::forward` on an all-zero `[1, 80, 20]` mel tensor with three decoder
/// tokens and, on success, expects hidden states of `[1, 3, d_model]`.
///
/// A forward-pass error is tolerated: the shape checks are skipped in that case.
#[test]
fn test_whisper_decoder_shape() {
    let cfg = tiny_test_config();
    let base_model = WhisperModel::new(cfg.clone()).expect("model creation");
    let silence = vec![0.0f32; 80 * 20];
    let features = Tensor::from_vec(silence, &[1, 80, 20]).expect("mel");
    let prompt: Vec<u32> = vec![1, 2, 3];
    if let Ok(hidden) = base_model.forward(&features, &prompt) {
        let dims = hidden.shape().to_vec();
        assert_eq!(dims[0], 1);
        assert_eq!(dims[1], 3);
        assert_eq!(dims[2], cfg.d_model);
    }
}
/// Runs `WhisperForConditionalGeneration::forward` on an all-zero `[1, 80, 20]` mel tensor
/// with three decoder tokens and, on success, expects logits of `[1, 3, vocab_size]`.
///
/// A forward-pass error is tolerated: the shape checks are skipped in that case.
#[test]
fn test_whisper_model_forward() {
    let cfg = tiny_test_config();
    let generator = WhisperForConditionalGeneration::new(cfg.clone()).expect("model creation");
    let silence = vec![0.0f32; 80 * 20];
    let features = Tensor::from_vec(silence, &[1, 80, 20]).expect("mel");
    let prompt: Vec<u32> = vec![1, 2, 3];
    if let Ok(logits) = generator.forward(&features, &prompt) {
        let dims = logits.shape().to_vec();
        assert_eq!(dims[0], 1);
        assert_eq!(dims[1], 3);
        assert_eq!(dims[2], cfg.vocab_size);
    }
}
/// `WhisperForConditionalGeneration::weight_map()` must be non-empty and must list, as the
/// first element of its pairs, the encoder `conv1`, decoder token-embedding and output
/// projection weight names.
#[test]
fn test_whisper_weight_map() {
    let map = WhisperForConditionalGeneration::weight_map();
    assert!(!map.is_empty());

    // Only the first element of each pair is inspected.
    let hf_keys = map
        .iter()
        .map(|(source_name, _)| *source_name)
        .collect::<Vec<&str>>();
    assert!(hf_keys.contains(&"model.encoder.conv1.weight"));
    assert!(hf_keys.contains(&"model.decoder.embed_tokens.weight"));
    assert!(hf_keys.contains(&"proj_out.weight"));
}
/// Runs `SpeechRecognitionTask::forward` on an all-zero `[1, 80, 20]` mel tensor with a
/// single decoder token and, on success, expects the third logits dimension to equal
/// `vocab_size`.
///
/// A forward-pass error is tolerated: the check is skipped in that case.
#[test]
fn test_whisper_speech_recognition_task() {
    let cfg = tiny_test_config();
    let asr = SpeechRecognitionTask::new(cfg.clone()).expect("task creation");
    let silence = vec![0.0f32; 80 * 20];
    let features = Tensor::from_vec(silence, &[1, 80, 20]).expect("mel");
    let prompt: Vec<u32> = vec![1];
    if let Ok(logits) = asr.forward(&features, &prompt) {
        let dims = logits.shape().to_vec();
        assert_eq!(dims[2], cfg.vocab_size);
    }
}