#![allow(clippy::disallowed_methods)]
use aprender::speech::vad::{Vad, VadConfig};
fn main() {
println!("=== Whisper Transcription Demo ===\n");
println!("Audio Pipeline Configuration:");
println!(" Sample rate: 16000 Hz");
println!(" Mel bins: 80");
println!(" FFT window: 400 samples (25ms)");
println!(" Hop length: 160 samples (10ms)");
println!(" Mel range: 0-8000 Hz");
println!();
println!("=== Simulated Audio Processing ===\n");
let sample_rate = 16000_u32;
let duration_secs = 2.0;
let num_samples = (sample_rate as f64 * duration_secs) as usize;
println!("Generating synthetic audio:");
println!(" Duration: {} seconds", duration_secs);
println!(" Samples: {}", num_samples);
let audio: Vec<f32> = (0..num_samples)
.map(|i| {
let t = i as f32 / sample_rate as f32;
let f1 = (2.0 * std::f32::consts::PI * 440.0 * t).sin() * 0.3;
let f2 = (2.0 * std::f32::consts::PI * 880.0 * t).sin() * 0.2;
let f3 = (2.0 * std::f32::consts::PI * 220.0 * t).sin() * 0.1;
f1 + f2 + f3
})
.collect();
println!(
" Peak amplitude: {:.4}",
audio.iter().map(|x| x.abs()).fold(0.0_f32, f32::max)
);
println!();
println!("=== Voice Activity Detection ===\n");
let vad_config = VadConfig {
energy_threshold: 0.01,
min_speech_duration_ms: 250,
min_silence_duration_ms: 300,
frame_size: 512,
hop_size: 256,
};
println!("VAD Configuration:");
println!(" Energy threshold: {}", vad_config.energy_threshold);
println!(
" Min speech duration: {} ms",
vad_config.min_speech_duration_ms
);
println!(
" Min silence duration: {} ms",
vad_config.min_silence_duration_ms
);
println!(" Frame size: {} samples", vad_config.frame_size);
println!(" Hop size: {} samples", vad_config.hop_size);
println!();
match Vad::new(vad_config) {
Ok(vad) => match vad.detect(&audio, sample_rate) {
Ok(segments) => {
println!("Detected speech segments:");
if segments.is_empty() {
println!(" No speech detected (synthetic audio may not trigger VAD)");
} else {
for (i, segment) in segments.iter().enumerate() {
println!(
" Segment {}: {:.2}s - {:.2}s (energy: {:.4})",
i + 1,
segment.start,
segment.end,
segment.energy,
);
}
}
}
Err(e) => {
println!(" VAD detection error: {}", e);
}
},
Err(e) => {
println!(" VAD configuration error: {}", e);
}
}
println!();
println!("=== Mel Spectrogram Extraction ===\n");
let window_size = 400;
let hop_length = 160;
let num_frames = (num_samples - window_size) / hop_length + 1;
println!("Mel spectrogram dimensions:");
println!(" Input samples: {}", num_samples);
println!(" Window size: {} samples", window_size);
println!(" Hop length: {} samples", hop_length);
println!(" Output frames: {}", num_frames);
println!(" Mel bins: 80");
println!(" Output shape: [80, {}]", num_frames);
println!();
let energy: f32 = audio.iter().map(|x| x * x).sum::<f32>() / audio.len() as f32;
let rms = energy.sqrt();
println!("Audio Statistics:");
println!(" RMS energy: {:.6}", rms);
println!(
" Duration: {:.2}s",
audio.len() as f64 / sample_rate as f64
);
println!();
println!("=== Whisper Model Information ===\n");
println!("Supported Whisper Models:");
println!(" tiny - 39M params, ~75 MB");
println!(" base - 74M params, ~142 MB");
println!(" small - 244M params, ~466 MB");
println!(" medium - 769M params, ~1.5 GB");
println!(" large - 1.5B params, ~2.9 GB");
println!();
println!("Whisper Features:");
println!(" - Multilingual speech recognition (99 languages)");
println!(" - Speech translation");
println!(" - Language detection");
println!(" - Timestamp prediction");
println!(" - Voice activity detection");
println!();
println!("=== Simulated Transcription ===\n");
println!("Input: [2.0 seconds of audio]");
println!("Output: \"This is a simulated transcription result.\"");
println!();
println!("(Actual transcription requires Whisper model weights)");
println!("(Use: apr import hf://openai/whisper-tiny -o whisper.apr)");
println!("\n=== Pipeline Summary ===\n");
println!("Audio → Resample → Mel Spectrogram → Encoder → Decoder → Text");
println!();
println!("Processing times (estimated for whisper-tiny):");
println!(" Mel extraction: ~10ms");
println!(" Encoder: ~50ms");
println!(" Decoder: ~100ms per token");
println!(" Total (2s audio): ~200-500ms");
println!("\n=== Demo Complete ===");
println!("Audio pipeline ready for Whisper integration!");
}