aprender-core 0.31.2

Next-generation machine learning library in pure Rust
#![allow(clippy::disallowed_methods)]
//! Whisper Transcription Demo (Section O2)
//!
//! Demonstrates the audio processing pipeline for speech recognition.
//! Shows mel spectrogram extraction and VAD (Voice Activity Detection).
//!
//! # Usage
//! ```bash
//! cargo run --example whisper_transcribe
//! ```

use aprender::speech::vad::{Vad, VadConfig};

fn main() {
    println!("=== Whisper Transcription Demo ===\n");

    // Audio configuration
    println!("Audio Pipeline Configuration:");
    println!("  Sample rate: 16000 Hz");
    println!("  Mel bins: 80");
    println!("  FFT window: 400 samples (25ms)");
    println!("  Hop length: 160 samples (10ms)");
    println!("  Mel range: 0-8000 Hz");
    println!();

    // Simulate audio processing
    println!("=== Simulated Audio Processing ===\n");

    // Generate synthetic audio (sine wave at 440 Hz - A4 note)
    let sample_rate = 16000_u32;
    let duration_secs = 2.0;
    let num_samples = (sample_rate as f64 * duration_secs) as usize;

    println!("Generating synthetic audio:");
    println!("  Duration: {} seconds", duration_secs);
    println!("  Samples: {}", num_samples);

    let audio: Vec<f32> = (0..num_samples)
        .map(|i| {
            let t = i as f32 / sample_rate as f32;
            // Mix of frequencies to simulate speech-like content
            let f1 = (2.0 * std::f32::consts::PI * 440.0 * t).sin() * 0.3;
            let f2 = (2.0 * std::f32::consts::PI * 880.0 * t).sin() * 0.2;
            let f3 = (2.0 * std::f32::consts::PI * 220.0 * t).sin() * 0.1;
            f1 + f2 + f3
        })
        .collect();

    println!(
        "  Peak amplitude: {:.4}",
        audio.iter().map(|x| x.abs()).fold(0.0_f32, f32::max)
    );
    println!();

    // Voice Activity Detection
    println!("=== Voice Activity Detection ===\n");

    let vad_config = VadConfig {
        energy_threshold: 0.01,
        min_speech_duration_ms: 250,
        min_silence_duration_ms: 300,
        frame_size: 512,
        hop_size: 256,
    };

    println!("VAD Configuration:");
    println!("  Energy threshold: {}", vad_config.energy_threshold);
    println!(
        "  Min speech duration: {} ms",
        vad_config.min_speech_duration_ms
    );
    println!(
        "  Min silence duration: {} ms",
        vad_config.min_silence_duration_ms
    );
    println!("  Frame size: {} samples", vad_config.frame_size);
    println!("  Hop size: {} samples", vad_config.hop_size);
    println!();

    match Vad::new(vad_config) {
        Ok(vad) => match vad.detect(&audio, sample_rate) {
            Ok(segments) => {
                println!("Detected speech segments:");
                if segments.is_empty() {
                    println!("  No speech detected (synthetic audio may not trigger VAD)");
                } else {
                    for (i, segment) in segments.iter().enumerate() {
                        println!(
                            "  Segment {}: {:.2}s - {:.2}s (energy: {:.4})",
                            i + 1,
                            segment.start,
                            segment.end,
                            segment.energy,
                        );
                    }
                }
            }
            Err(e) => {
                println!("  VAD detection error: {}", e);
            }
        },
        Err(e) => {
            println!("  VAD configuration error: {}", e);
        }
    }
    println!();

    // Mel Spectrogram (conceptual)
    println!("=== Mel Spectrogram Extraction ===\n");

    let window_size = 400;
    let hop_length = 160;
    let num_frames = (num_samples - window_size) / hop_length + 1;

    println!("Mel spectrogram dimensions:");
    println!("  Input samples: {}", num_samples);
    println!("  Window size: {} samples", window_size);
    println!("  Hop length: {} samples", hop_length);
    println!("  Output frames: {}", num_frames);
    println!("  Mel bins: 80");
    println!("  Output shape: [80, {}]", num_frames);
    println!();

    // Compute basic statistics (simulating mel computation)
    let energy: f32 = audio.iter().map(|x| x * x).sum::<f32>() / audio.len() as f32;
    let rms = energy.sqrt();

    println!("Audio Statistics:");
    println!("  RMS energy: {:.6}", rms);
    println!(
        "  Duration: {:.2}s",
        audio.len() as f64 / sample_rate as f64
    );
    println!();

    // Whisper model info
    println!("=== Whisper Model Information ===\n");

    println!("Supported Whisper Models:");
    println!("  tiny    -  39M params, ~75 MB");
    println!("  base    -  74M params, ~142 MB");
    println!("  small   - 244M params, ~466 MB");
    println!("  medium  - 769M params, ~1.5 GB");
    println!("  large   - 1.5B params, ~2.9 GB");
    println!();

    println!("Whisper Features:");
    println!("  - Multilingual speech recognition (99 languages)");
    println!("  - Speech translation");
    println!("  - Language detection");
    println!("  - Timestamp prediction");
    println!("  - Voice activity detection");
    println!();

    // Simulated transcription output
    println!("=== Simulated Transcription ===\n");
    println!("Input: [2.0 seconds of audio]");
    println!("Output: \"This is a simulated transcription result.\"");
    println!();
    println!("(Actual transcription requires Whisper model weights)");
    println!("(Use: apr import hf://openai/whisper-tiny -o whisper.apr)");

    // Pipeline summary
    println!("\n=== Pipeline Summary ===\n");
    println!("Audio → Resample → Mel Spectrogram → Encoder → Decoder → Text");
    println!();
    println!("Processing times (estimated for whisper-tiny):");
    println!("  Mel extraction: ~10ms");
    println!("  Encoder: ~50ms");
    println!("  Decoder: ~100ms per token");
    println!("  Total (2s audio): ~200-500ms");

    println!("\n=== Demo Complete ===");
    println!("Audio pipeline ready for Whisper integration!");
}