// whisp 0.5.0 - Docs.rs

//! Local Whisper transcription using whisper-rs.
//!
//! This module provides local transcription using the whisper.cpp library
//! via whisper-rs bindings.

use std::path::PathBuf;
use std::sync::Mutex;

use async_trait::async_trait;
use bytes::Bytes;
use tracing::{debug, info};
use whisper_rs::{
    FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters, WhisperState,
};

use super::model::{WhisperModel, model_path};
#[cfg(target_os = "macos")]
use super::model::{coreml_encoder_exists, coreml_encoder_path, ensure_coreml_encoder};
use super::{Result, TranscribeError, Transcriber};

/// Configuration for the local Whisper transcriber.
///
/// Build one with [`LocalWhisperConfig::new`] and adjust it with the `with_*`
/// builder methods.
#[derive(Debug, Clone)]
pub struct LocalWhisperConfig {
    /// The model to use.
    ///
    /// Also selects which CoreML encoder is checked for / downloaded on macOS.
    pub model: WhisperModel,
    /// Optional override path to the model file.
    ///
    /// When `None`, the path is resolved from `model` at load time.
    pub model_path: Option<PathBuf>,
    /// Enable CoreML acceleration on macOS.
    /// When enabled, downloads the CoreML encoder for ~3x faster encoding via Apple Neural Engine.
    /// On other platforms the CoreML setup step is a no-op, so this flag has no effect there.
    pub coreml: bool,
}

impl LocalWhisperConfig {
    /// Build a configuration for `model` with no model-path override.
    ///
    /// CoreML starts enabled when compiled for macOS and disabled everywhere else.
    pub fn new(model: WhisperModel) -> Self {
        let coreml = cfg!(target_os = "macos");
        Self {
            model,
            model_path: None,
            coreml,
        }
    }

    /// Return this config with the model file location overridden by `path`.
    pub fn with_model_path(self, path: PathBuf) -> Self {
        Self {
            model_path: Some(path),
            ..self
        }
    }

    /// Return this config with CoreML acceleration switched on or off (macOS only).
    pub fn with_coreml(self, enabled: bool) -> Self {
        Self {
            coreml: enabled,
            ..self
        }
    }
}

/// Holds the WhisperContext and a reusable WhisperState.
///
/// The state is created once from the context and then reused for every
/// transcription instead of being rebuilt per request.
struct WhisperInstance {
    // NOTE(review): struct fields are dropped in declaration order, so the
    // context is dropped before `state` — confirm whisper-rs does not require
    // the state to be dropped first.
    /// Kept alive to ensure state remains valid.
    ///
    /// Never read directly, hence the leading underscore.
    _context: WhisperContext,
    /// Inference state created from `_context`; reused across transcriptions.
    state: WhisperState,
}

impl WhisperInstance {
    /// Take ownership of `context` and create the single state that will be
    /// reused for all later transcriptions.
    ///
    /// # Errors
    ///
    /// Returns [`TranscribeError::TranscriptionFailed`] when the state cannot
    /// be created from the context.
    fn new(context: WhisperContext) -> Result<Self> {
        match context.create_state() {
            Ok(state) => Ok(Self {
                _context: context,
                state,
            }),
            Err(e) => Err(TranscribeError::TranscriptionFailed(format!(
                "Failed to create state: {}",
                e
            ))),
        }
    }
}

/// Local Whisper transcriber using whisper.cpp.
///
/// The model is not loaded at construction time; it is loaded on first use and
/// then kept behind a mutex for reuse.
pub struct LocalWhisperClient {
    /// Model selection and acceleration settings supplied at construction.
    config: LocalWhisperConfig,
    /// Lazily initialized whisper instance (context + reusable state).
    /// `None` until the first successful model load.
    instance: Mutex<Option<WhisperInstance>>,
}

impl LocalWhisperClient {
    /// Create a new local Whisper client.
    ///
    /// Construction is cheap: the model is not loaded until the whisper
    /// instance is first requested.
    pub fn new(config: LocalWhisperConfig) -> Self {
        Self {
            config,
            instance: Mutex::new(None),
        }
    }

    /// Ensure CoreML encoder is available on macOS.
    ///
    /// This downloads the CoreML encoder if it doesn't exist and CoreML is enabled.
    /// Must be called before loading the model since whisper.cpp looks for the
    /// encoder at model load time.
    ///
    /// # Errors
    ///
    /// Returns [`TranscribeError::TranscriptionFailed`] if the encoder's
    /// presence cannot be determined or the download fails.
    #[cfg(target_os = "macos")]
    async fn ensure_coreml_setup(&self) -> Result<()> {
        if !self.config.coreml {
            return Ok(());
        }

        // Check if CoreML encoder already exists
        if coreml_encoder_exists(self.config.model)
            .map_err(|e| TranscribeError::TranscriptionFailed(e.to_string()))?
        {
            info!(
                model = ?self.config.model,
                path = ?coreml_encoder_path(self.config.model).ok(),
                "CoreML encoder available"
            );
            return Ok(());
        }

        // Download the CoreML encoder
        info!(
            model = ?self.config.model,
            "Downloading CoreML encoder for faster transcription..."
        );

        ensure_coreml_encoder(self.config.model, |downloaded, total| {
            // Float-to-int `as` casts saturate (NaN becomes 0), so an unknown
            // total of zero cannot panic here.
            let percent = (downloaded as f64 / total as f64 * 100.0) as u32;
            if percent.is_multiple_of(10) {
                debug!("CoreML encoder download: {}%", percent);
            }
        })
        .await
        .map_err(|e| {
            TranscribeError::TranscriptionFailed(format!(
                "Failed to download CoreML encoder: {}",
                e
            ))
        })?;

        Ok(())
    }

    /// No-op on non-macOS platforms.
    #[cfg(not(target_os = "macos"))]
    async fn ensure_coreml_setup(&self) -> Result<()> {
        Ok(())
    }

    /// Get or initialize the whisper instance, returning a guard.
    ///
    /// The first call loads the model (from the configured override path, or
    /// the path resolved from the configured model) while holding the lock,
    /// so concurrent callers wait instead of loading the model twice. On
    /// success the guarded `Option` is always `Some`.
    ///
    /// # Errors
    ///
    /// Returns [`TranscribeError::TranscriptionFailed`] if the mutex is
    /// poisoned, the model path cannot be resolved or is not valid UTF-8, or
    /// the model/state fails to load.
    fn ensure_instance(&self) -> Result<std::sync::MutexGuard<'_, Option<WhisperInstance>>> {
        let mut guard = self.instance.lock().map_err(|e| {
            TranscribeError::TranscriptionFailed(format!("Failed to lock instance: {}", e))
        })?;
        if guard.is_none() {
            let path = match &self.config.model_path {
                Some(p) => p.clone(),
                None => model_path(self.config.model)
                    .map_err(|e| TranscribeError::TranscriptionFailed(e.to_string()))?,
            };

            info!(path = ?path, "Loading Whisper model");

            let ctx = WhisperContext::new_with_params(
                path.to_str().ok_or_else(|| {
                    TranscribeError::TranscriptionFailed("Invalid model path".to_string())
                })?,
                WhisperContextParameters::default(),
            )
            .map_err(|e| {
                TranscribeError::TranscriptionFailed(format!("Failed to load model: {}", e))
            })?;

            let instance = WhisperInstance::new(ctx)?;
            info!("Whisper model loaded successfully");
            *guard = Some(instance);
        }
        Ok(guard)
    }

    /// Convert WAV audio data to 16kHz mono f32 samples.
    ///
    /// Integer PCM is scaled by `2^(bits_per_sample - 1)`, multi-channel audio
    /// is averaged down to one channel, and the result is resampled with
    /// [`resample`] when the source rate is not 16 kHz.
    ///
    /// # Errors
    ///
    /// Returns [`TranscribeError::InvalidAudioFormat`] if the data is not a
    /// readable WAV stream, a sample fails to decode, the header declares a
    /// sample rate of zero, or an integer bit depth outside `1..=32`.
    fn convert_audio(&self, audio: &[u8]) -> Result<Vec<f32>> {
        use std::io::Cursor;

        /// Sample rate the audio is converted to before transcription.
        const TARGET_RATE: u32 = 16_000;

        let cursor = Cursor::new(audio);
        let reader = hound::WavReader::new(cursor).map_err(|e| {
            TranscribeError::InvalidAudioFormat(format!("Failed to read WAV: {}", e))
        })?;

        let spec = reader.spec();
        let sample_rate = spec.sample_rate;
        let channels = spec.channels as usize;

        debug!(
            sample_rate = sample_rate,
            channels = channels,
            bits_per_sample = spec.bits_per_sample,
            "Converting audio"
        );

        // A zero sample rate makes the resampling ratio degenerate; reject it
        // here as a format error rather than letting the resampler blow up.
        if sample_rate == 0 {
            return Err(TranscribeError::InvalidAudioFormat(
                "Invalid sample rate: 0 Hz".to_string(),
            ));
        }

        // Read samples as f32
        let samples: Vec<f32> = match spec.sample_format {
            hound::SampleFormat::Float => reader
                .into_samples::<f32>()
                .collect::<std::result::Result<Vec<_>, _>>()
                .map_err(|e| {
                    TranscribeError::InvalidAudioFormat(format!("Failed to read samples: {}", e))
                })?,
            hound::SampleFormat::Int => {
                let bits = spec.bits_per_sample;
                // Full-scale magnitude is 2^(bits - 1). Checked arithmetic turns
                // a declared depth of 0 or more than 32 bits into an error
                // instead of an underflow / shift-overflow panic.
                let full_scale = bits
                    .checked_sub(1)
                    .and_then(|shift| 1u32.checked_shl(u32::from(shift)))
                    .ok_or_else(|| {
                        TranscribeError::InvalidAudioFormat(format!(
                            "Unsupported bit depth: {}",
                            bits
                        ))
                    })?;
                let max_val = full_scale as f32;
                reader
                    .into_samples::<i32>()
                    .collect::<std::result::Result<Vec<_>, _>>()
                    .map_err(|e| {
                        TranscribeError::InvalidAudioFormat(format!(
                            "Failed to read samples: {}",
                            e
                        ))
                    })?
                    .into_iter()
                    .map(|s| s as f32 / max_val)
                    .collect()
            }
        };

        let original_sample_count = samples.len();

        // Convert to mono by averaging each interleaved frame
        let mono_samples: Vec<f32> = if channels > 1 {
            samples
                .chunks(channels)
                .map(|chunk| chunk.iter().sum::<f32>() / channels as f32)
                .collect()
        } else {
            samples
        };

        // Resample to 16kHz if needed
        let resampled = if sample_rate != TARGET_RATE {
            resample(&mono_samples, sample_rate, TARGET_RATE)
        } else {
            mono_samples
        };

        debug!(
            original_samples = original_sample_count,
            resampled_samples = resampled.len(),
            "Audio conversion complete"
        );

        Ok(resampled)
    }
}

/// Linear interpolation resampling.
///
/// Produces `floor(samples.len() * to_rate / from_rate)` output samples. Each
/// output sample is interpolated between the two input samples surrounding
/// its source position; positions at the last input sample reuse that sample.
///
/// Equal rates (including both zero) return a copy of the input. If exactly
/// one rate is zero the conversion ratio is undefined and an empty vector is
/// returned, rather than attempting an unbounded allocation.
///
/// Technically this can introduce aliasing artifacts when downsampling without
/// a low-pass filter, but in practice the output sounds identical for speech.
/// Consider using the `rubato` crate for sinc interpolation if quality issues arise.
fn resample(samples: &[f32], from_rate: u32, to_rate: u32) -> Vec<f32> {
    if from_rate == to_rate {
        return samples.to_vec();
    }
    if from_rate == 0 || to_rate == 0 {
        return Vec::new();
    }

    // Compute the output length with exact integer arithmetic: going through
    // the rounded floating-point ratio can land just below a whole number and
    // truncate one sample short.
    let exact_len = samples.len() as u128 * u128::from(to_rate) / u128::from(from_rate);
    let new_len = exact_len.min(usize::MAX as u128) as usize;

    // Input samples advanced per output sample.
    let ratio = f64::from(from_rate) / f64::from(to_rate);

    (0..new_len)
        .map(|i| {
            let src_idx = i as f64 * ratio;
            let base = src_idx.floor() as usize;
            let frac = src_idx - base as f64;

            match (samples.get(base), samples.get(base + 1)) {
                // Interior position: blend the two neighbouring samples.
                (Some(&s0), Some(&s1)) => {
                    (f64::from(s0) * (1.0 - frac) + f64::from(s1) * frac) as f32
                }
                // At the final input sample there is no right-hand neighbour.
                (Some(&s0), None) => s0,
                // Past the end of the input: pad with silence.
                _ => 0.0,
            }
        })
        .collect()
}

#[async_trait]
impl Transcriber for LocalWhisperClient {
    /// Transcribe WAV-encoded `audio` with the local model.
    ///
    /// `language` is passed straight to whisper; `None` requests automatic
    /// language detection. The text of all segments is concatenated and the
    /// surrounding whitespace trimmed.
    async fn transcribe(&self, audio: Bytes, language: Option<&str>) -> Result<String> {
        // On macOS this may download the CoreML encoder; elsewhere it does nothing.
        self.ensure_coreml_setup().await?;

        // Decode the WAV payload into the samples whisper consumes.
        let pcm = self.convert_audio(&audio)?;

        // Lock the instance, loading the model first if this is the first use.
        let mut slot = self.ensure_instance()?;
        let whisper = slot.as_mut().expect("instance should be initialized");

        let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });

        // `Some(lang)` pins the language, `None` lets whisper auto-detect it.
        params.set_language(language);

        // Keep whisper from printing to stdout.
        params.set_print_special(false);
        params.set_print_progress(false);
        params.set_print_realtime(false);
        params.set_print_timestamps(false);

        // Run inference on the reusable state.
        if let Err(e) = whisper.state.full(params, &pcm) {
            return Err(TranscribeError::TranscriptionFailed(format!(
                "Transcription failed: {}",
                e
            )));
        }

        // Stitch the segment texts together, skipping any segment that is
        // missing or whose text cannot be read.
        let segment_count = whisper.state.full_n_segments();
        let mut transcript = String::new();
        for index in 0..segment_count {
            let Some(segment) = whisper.state.get_segment(index) else {
                continue;
            };
            if let Ok(text) = segment.to_str_lossy() {
                transcript.push_str(&text);
            }
        }

        Ok(transcript.trim().to_owned())
    }

    /// Identifier for this transcriber backend.
    fn name(&self) -> &str {
        "local-whisper"
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Downsampling 48 kHz to 16 kHz must yield one third as many samples.
    #[test]
    fn test_resample() {
        let samples: Vec<f32> = (0..48000).map(|i| (i as f32 / 48000.0).sin()).collect();
        let resampled = resample(&samples, 48000, 16000);
        assert_eq!(resampled.len(), 16000);
    }

    /// Pins the interpolated values, not just the output length.
    #[test]
    fn test_resample_values() {
        // Equal rates return the input unchanged.
        assert_eq!(resample(&[1.0, 2.0], 16000, 16000), vec![1.0, 2.0]);
        // 2:1 downsampling picks every second sample.
        assert_eq!(resample(&[0.0, 1.0, 2.0, 3.0], 2, 1), vec![0.0, 2.0]);
        // 1:2 upsampling interpolates midpoints and holds the last sample.
        assert_eq!(resample(&[0.0, 1.0], 1, 2), vec![0.0, 0.5, 1.0, 1.0]);
    }

    #[test]
    fn test_config_new() {
        let config = LocalWhisperConfig::new(WhisperModel::BaseQ8_0);
        assert_eq!(config.model, WhisperModel::BaseQ8_0);
        assert!(config.model_path.is_none());
        // CoreML defaults to on only when built for macOS.
        assert_eq!(config.coreml, cfg!(target_os = "macos"));
    }

    #[test]
    fn test_config_builders() {
        let config = LocalWhisperConfig::new(WhisperModel::BaseQ8_0)
            .with_model_path(PathBuf::from("custom-model.bin"))
            .with_coreml(false);
        assert_eq!(config.model_path, Some(PathBuf::from("custom-model.bin")));
        assert!(!config.coreml);
    }

    /// Integration test for local whisper transcription.
    ///
    /// This test requires:
    /// 1. A `test.wav` file in the project root (not committed to repo)
    /// 2. The tiny-q8_0 model to be downloaded (will download if missing)
    ///
    /// Run with: `cargo test --features local-whisper test_transcribe_wav -- --ignored --nocapture`
    ///
    /// Outputs `test-processed.wav` so you can hear the resampled audio.
    #[test]
    #[ignore]
    fn test_transcribe_wav() {
        use std::io::Cursor;

        use crate::transcribe::model::ensure_model;

        // Look for test.wav in project root.
        // NOTE(review): this assumes the crate manifest sits two directories
        // below the project root — confirm against the workspace layout.
        let project_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .unwrap()
            .parent()
            .unwrap();
        let test_file = project_root.join("test.wav");

        if !test_file.exists() {
            eprintln!("Skipping test: {} not found", test_file.display());
            eprintln!("Place a test.wav file in the project root to run this test");
            return;
        }

        let audio_data = std::fs::read(&test_file).expect("Failed to read test.wav");
        eprintln!(
            "Read {} bytes from {}",
            audio_data.len(),
            test_file.display()
        );

        // Use the smallest quantized model for faster testing. Creating the
        // client does not load the model, so it is safe to build it up front.
        let model = WhisperModel::TinyQ8_0;
        let client = LocalWhisperClient::new(LocalWhisperConfig::new(model));

        // Convert the audio through the same code path the transcriber uses,
        // then write the processed version so it can be listened to.
        let processed_file = project_root.join("test-processed.wav");
        {
            let spec = hound::WavReader::new(Cursor::new(&audio_data))
                .expect("Failed to read WAV")
                .spec();
            eprintln!(
                "Input: {}Hz, {} channels, {} bits",
                spec.sample_rate, spec.channels, spec.bits_per_sample
            );

            let resampled = client
                .convert_audio(&audio_data)
                .expect("Failed to convert audio");

            // `convert_audio` always produces 16 kHz mono output.
            let target_rate = 16000u32;
            eprintln!(
                "Output: {}Hz, 1 channel, {} samples ({:.2}s)",
                target_rate,
                resampled.len(),
                resampled.len() as f64 / target_rate as f64
            );

            // Write processed audio
            let out_spec = hound::WavSpec {
                channels: 1,
                sample_rate: target_rate,
                bits_per_sample: 32,
                sample_format: hound::SampleFormat::Float,
            };
            let mut writer =
                hound::WavWriter::create(&processed_file, out_spec).expect("Failed to create WAV");
            for sample in &resampled {
                writer
                    .write_sample(*sample)
                    .expect("Failed to write sample");
            }
            writer.finalize().expect("Failed to finalize WAV");
            eprintln!("Wrote processed audio to {}", processed_file.display());
        }

        // Ensure model is downloaded
        eprintln!("Ensuring model {:?} is available...", model);
        let rt = tokio::runtime::Runtime::new().unwrap();
        rt.block_on(async {
            ensure_model(model, |downloaded, total| {
                let percent = (downloaded as f64 / total as f64 * 100.0) as u32;
                if percent.is_multiple_of(25) {
                    eprintln!("Downloading model: {}%", percent);
                }
            })
            .await
            .expect("Failed to download model");
        });

        // Transcribe the original (unprocessed) file contents.
        let result = rt.block_on(async { client.transcribe(audio_data.into(), None).await });

        match result {
            Ok(text) => {
                eprintln!("Transcription successful!");
                eprintln!("---");
                eprintln!("{}", text);
                eprintln!("---");
                assert!(!text.is_empty(), "Transcription should not be empty");
            }
            Err(e) => {
                panic!("Transcription failed: {:?}", e);
            }
        }
    }
}