transcribe-rs 0.3.4

A simple library to help you transcribe audio
Documentation
//! Chunked transcription strategies.
//!
//! The [`Transcriber`] trait provides a universal wrapper for non-trivial
//! transcription workflows. Different implementations handle different
//! chunking approaches:
//!
//! - [`VadChunked`] — splits audio on speech/silence boundaries using a [`Vad`](crate::vad::Vad)
//! - [`EnergyAdaptiveChunked`] — fixed-duration chunks with energy-based split point search
//!
//! The model is borrowed per-call (`&mut dyn SpeechModel`), never owned.
//! This works naturally with `Arc<Mutex<Box<dyn SpeechModel>>>` — lock for
//! the call, unlock after.
//!
//! # Examples
//!
//! **File transcription with VAD chunking:**
//! ```ignore
//! use transcribe_rs::transcriber::{Transcriber, VadChunked, VadChunkedConfig};
//! use transcribe_rs::vad::{SmoothedVad, EnergyVad};
//!
//! let vad = SmoothedVad::new(Box::new(EnergyVad::new(480, 0.01)), 15, 15, 2);
//! let mut chunker = VadChunked::new(Box::new(vad), VadChunkedConfig::default(), options);
//! let result = chunker.transcribe_file(&mut model, &path)?;
//! ```
//!
//! **Live audio with per-frame feeding:**
//! ```ignore
//! let mut t = VadChunked::new(vad, config, options);
//! for frame in audio_frames {
//!     for result in t.feed(&mut model, &frame)? {
//!         println!("{}", result.text);
//!     }
//! }
//! let final_result = t.finish(&mut model)?;
//! ```

mod energy_adaptive_chunked;
mod merge;
#[cfg(test)]
pub(crate) mod test_helpers;
mod vad_chunked;

pub use energy_adaptive_chunked::{EnergyAdaptiveChunked, EnergyAdaptiveConfig};
pub use merge::{merge_sequential, merge_sequential_with_separator, DEFAULT_MERGE_SEPARATOR};
pub use vad_chunked::{VadChunked, VadChunkedConfig};

/// Expected sample rate for all transcription audio.
pub(crate) const SAMPLE_RATE: f32 = 16000.0;

/// Compute RMS energy of an audio frame.
pub(crate) fn rms_energy(frame: &[f32]) -> f32 {
    if frame.is_empty() {
        return 0.0;
    }
    (frame.iter().map(|s| s * s).sum::<f32>() / frame.len() as f32).sqrt()
}

use std::path::Path;

use crate::{audio, SpeechModel, TranscribeError, TranscribeOptions, TranscriptionResult};

/// Transcribe a chunk with optional silence padding and timestamp adjustment.
///
/// Prepends and appends `padding_secs` of silence, enforces `min_duration_secs`
/// via zero-padding, calls `model.transcribe`, then adjusts segment timestamps
/// to account for the padding and the chunk's position in the overall audio.
pub(crate) fn transcribe_padded(
    model: &mut dyn SpeechModel,
    samples: &[f32],
    padding_secs: f32,
    min_duration_secs: f32,
    chunk_start_secs: f32,
    options: &TranscribeOptions,
) -> Result<TranscriptionResult, TranscribeError> {
    let pad_samples = (padding_secs * SAMPLE_RATE) as usize;
    let mut padded = Vec::with_capacity(pad_samples + samples.len() + pad_samples);
    padded.resize(pad_samples, 0.0);
    padded.extend_from_slice(samples);
    padded.resize(padded.len() + pad_samples, 0.0);

    let min_samples = (min_duration_secs * SAMPLE_RATE) as usize;
    if padded.len() < min_samples {
        padded.resize(min_samples, 0.0);
    }

    let mut result = model.transcribe(&padded, options)?;

    if let Some(segments) = &mut result.segments {
        for seg in segments.iter_mut() {
            seg.start = (seg.start - padding_secs + chunk_start_secs).max(0.0);
            seg.end = (seg.end - padding_secs + chunk_start_secs).max(0.0);
        }
    }

    Ok(result)
}

/// A chunked transcription strategy.
///
/// Implementations split audio into chunks, transcribe each chunk via
/// a [`SpeechModel`], and merge the results. The model is borrowed
/// per-call, never owned.
///
/// All methods take `&mut self`, making the trait fully object-safe
/// (`Box<dyn Transcriber>` works). After [`finish()`](Transcriber::finish)
/// returns, the transcriber is reset and ready for a new session.
pub trait Transcriber: Send {
    /// Feed audio samples. Internally buffers, detects chunk boundaries,
    /// and transcribes chunks as they become ready.
    ///
    /// Returns all chunk results that completed during this call.
    ///
    /// For live audio: called per-frame (e.g. 480 samples), usually returns
    /// an empty vec, occasionally returns 1 result when a chunk boundary
    /// is detected.
    ///
    /// For files: called with all samples at once, returns N results
    /// (one per chunk that completed during processing).
    fn feed(
        &mut self,
        model: &mut dyn SpeechModel,
        samples: &[f32],
    ) -> Result<Vec<TranscriptionResult>, TranscribeError>;

    /// Transcribe any remaining buffered audio and return the merged
    /// result of the entire session. Resets internal state for reuse.
    fn finish(
        &mut self,
        model: &mut dyn SpeechModel,
    ) -> Result<TranscriptionResult, TranscribeError>;

    /// Convenience: feed all samples and finish in one call.
    ///
    /// The return value of `feed()` is intentionally discarded here —
    /// intermediate results are accumulated internally and merged by
    /// `finish()`.
    fn transcribe(
        &mut self,
        model: &mut dyn SpeechModel,
        samples: &[f32],
    ) -> Result<TranscriptionResult, TranscribeError> {
        self.feed(model, samples)?;
        self.finish(model)
    }

    /// Convenience: load a WAV file, feed, and finish.
    fn transcribe_file(
        &mut self,
        model: &mut dyn SpeechModel,
        path: &Path,
    ) -> Result<TranscriptionResult, TranscribeError> {
        let samples = audio::read_wav_samples(path)?;
        self.transcribe(model, &samples)
    }
}