Skip to main content

transcribe_rs/
lib.rs

//! # transcribe-rs
//!
//! A Rust library providing unified transcription capabilities using multiple speech recognition engines.
//!
//! ## Features
//!
//! - **ONNX Models**: SenseVoice, GigaAM, Parakeet, Moonshine (requires `onnx` feature)
//! - **Whisper**: OpenAI Whisper via GGML (requires `whisper-cpp` feature)
//! - **Whisperfile**: Mozilla Whisperfile server (requires `whisperfile` feature)
//! - **Remote**: OpenAI API (requires `openai` feature)
//! - **Timestamped Results**: Detailed timing information for transcribed segments
//! - **Unified API**: `SpeechModel` trait for all local engines
//! - **Hardware Acceleration**: GPU support for ORT engines (`ort-cuda`, `ort-rocm`,
//!   `ort-directml`, `ort-coreml`, `ort-webgpu`) and whisper.cpp (Metal/Vulkan)
//!   via the [`accel`] module
//!
//! ## Backend Categories
//!
//! This crate provides two categories of transcription backends:
//!
//! - **Local models** implement [`SpeechModel`] and run inference in-process or via
//!   a local binary. This includes all ONNX models, Whisper (via whisper.cpp), and
//!   Whisperfile.
//! - **Remote services** implement [`RemoteTranscriptionEngine`] (requires `openai`
//!   feature) and make async network calls to external APIs. This includes OpenAI.
//!
//! These traits are intentionally separate because the execution model differs:
//! local models are synchronous and take audio samples directly, while remote
//! services are async and may only accept file uploads.
//!
//! ## Quick Start
//!
//! ```toml
//! [dependencies]
//! transcribe-rs = { version = "0.3", features = ["onnx"] }
//! ```
//!
//! ```ignore
//! use std::path::PathBuf;
//! use transcribe_rs::onnx::sense_voice::{SenseVoiceModel, SenseVoiceParams};
//! use transcribe_rs::onnx::Quantization;
//! use transcribe_rs::SpeechModel;
//!
//! let mut model = SenseVoiceModel::load(
//!     &PathBuf::from("models/sense-voice"),
//!     &Quantization::Int8,
//! )?;
//!
//! let result = model.transcribe(&samples, &transcribe_rs::TranscribeOptions::default())?;
//! println!("Transcription: {}", result.text);
//! # Ok::<(), Box<dyn std::error::Error>>(())
//! ```
//!
//! ## Audio Requirements
//!
//! Input audio files must be:
//! - WAV format
//! - 16 kHz sample rate
//! - 16-bit samples
//! - Mono (single channel)
//!
//! ## Migrating from 0.2.x to 0.3.0
//!
//! Version 0.3.0 is a breaking release. If you need the old API, pin to `version = "=0.2.9"`.
//!
//! **`SpeechModel::transcribe` signature changed:**
//!
//! ```rust,ignore
//! // Before (0.2.x):
//! model.transcribe(&samples, Some("en"))?;
//! model.transcribe_file(&path, None)?;
//!
//! // After (0.3.0):
//! use transcribe_rs::TranscribeOptions;
//! model.transcribe(&samples, &TranscribeOptions { language: Some("en".into()), ..Default::default() })?;
//! model.transcribe_file(&path, &TranscribeOptions::default())?;
//! ```
//!
//! **`SpeechModel` now requires `Send`**, enabling `Box<dyn SpeechModel + Send>` for
//! use across threads.
//!
//! **`TranscribeOptions` includes a `translate` field** (default `false`). Engines that
//! support translation (Whisper, Whisperfile) will translate to English when set to `true`.
//!
//! **Whisper capabilities are now dynamic.** `WhisperEngine::capabilities()` returns the
//! actual language support of the loaded model (English-only vs multilingual) rather than
//! always reporting all 99 languages.

89pub mod accel;
90pub mod audio;
91pub mod error;
92pub use accel::{
93    get_ort_accelerator, get_whisper_accelerator, get_whisper_gpu_device, set_ort_accelerator,
94    set_whisper_accelerator, set_whisper_gpu_device, OrtAccelerator, WhisperAccelerator,
95    GPU_DEVICE_AUTO,
96};
97pub use error::TranscribeError;
98
99#[cfg(feature = "audio-features")]
100pub mod decode;
101#[cfg(feature = "audio-features")]
102pub mod features;
103#[cfg(feature = "onnx")]
104pub mod onnx;
105
106pub mod transcriber;
107pub mod vad;
108
109#[cfg(feature = "whisper-cpp")]
110pub mod whisper_cpp;
111#[cfg(feature = "whisperfile")]
112pub mod whisperfile;
113
114#[cfg(feature = "openai")]
115pub mod remote;
116#[cfg(feature = "openai")]
117pub use remote::RemoteTranscriptionEngine;
118
119use std::path::Path;
120
/// Static description of what a speech model can do.
///
/// Returned by [`SpeechModel::capabilities`] so callers can inspect a model's
/// abilities (languages, timestamps, translation, streaming) without knowing
/// the concrete engine type.
#[derive(Debug, Clone)]
pub struct ModelCapabilities {
    /// Human-readable model name.
    pub name: &'static str,
    /// Machine-friendly engine identifier (e.g. "sense_voice", "whisper_cpp").
    pub engine_id: &'static str,
    /// Expected input sample rate in Hz (e.g. 16000).
    pub sample_rate: u32,
    /// Languages supported (BCP-47 codes, e.g. "en", "zh"). Empty = any/unknown.
    pub languages: &'static [&'static str],
    /// Whether the model can produce word/segment timestamps.
    pub supports_timestamps: bool,
    /// Whether the model can translate to English.
    pub supports_translation: bool,
    /// Whether the model supports streaming inference.
    pub supports_streaming: bool,
}
139
/// Per-call options accepted by [`SpeechModel::transcribe`].
///
/// `Default::default()` yields no language hint, no translation, and
/// engine-chosen silence padding.
#[derive(Debug, Clone, Default)]
pub struct TranscribeOptions {
    /// Language hint (BCP-47 code, e.g. "en", "zh").
    /// Multilingual models use this as a hint; single-language models ignore it.
    pub language: Option<String>,
    /// Whether to translate the output to English (only supported by some engines).
    pub translate: bool,
    /// Leading silence padding in milliseconds prepended before audio.
    /// Some models (e.g. Parakeet) can drop the beginning of audio due to
    /// mel spectrogram windowing. Set to `Some(0)` to explicitly disable.
    /// When `None`, each engine applies its own default (e.g. Parakeet uses 250 ms).
    pub leading_silence_ms: Option<u32>,
    /// Trailing silence padding in milliseconds appended after audio.
    /// Set to `Some(0)` to explicitly disable.
    /// When `None`, each engine applies its own default (typically 0 ms).
    pub trailing_silence_ms: Option<u32>,
}
158
159/// Unified interface for speech-to-text models.
160///
161/// Each model implements this trait to provide a common transcription API.
162/// Model-specific parameters are exposed via a separate `transcribe_with()` method
163/// on the concrete type.
164///
165/// Engines implement [`transcribe_raw`](SpeechModel::transcribe_raw) with their
166/// inference logic. The default [`transcribe`](SpeechModel::transcribe) method
167/// handles silence padding and timestamp adjustment automatically.
168pub trait SpeechModel: Send {
169    /// Report this model's capabilities.
170    fn capabilities(&self) -> ModelCapabilities;
171
172    /// Default leading silence in milliseconds for this engine.
173    ///
174    /// Override to set a non-zero default. For example, Parakeet returns 250
175    /// because its mel spectrogram preprocessor attenuates the start of audio.
176    fn default_leading_silence_ms(&self) -> u32 {
177        0
178    }
179
180    /// Default trailing silence in milliseconds for this engine.
181    fn default_trailing_silence_ms(&self) -> u32 {
182        0
183    }
184
185    /// Raw transcription — engines implement this with their inference logic.
186    ///
187    /// Callers should prefer [`transcribe`](SpeechModel::transcribe) which
188    /// handles silence padding automatically. Use this method directly only
189    /// when managing padding yourself (e.g. chunked transcription).
190    fn transcribe_raw(
191        &mut self,
192        samples: &[f32],
193        options: &TranscribeOptions,
194    ) -> Result<TranscriptionResult, TranscribeError>;
195
196    /// Transcribe audio samples (16 kHz, mono, f32 in [-1, 1]).
197    ///
198    /// Prepends/appends silence padding based on [`TranscribeOptions`] (or
199    /// engine defaults), runs [`transcribe_raw`](SpeechModel::transcribe_raw),
200    /// then adjusts segment timestamps to account for the leading padding.
201    fn transcribe(
202        &mut self,
203        samples: &[f32],
204        options: &TranscribeOptions,
205    ) -> Result<TranscriptionResult, TranscribeError> {
206        let lead_ms = options
207            .leading_silence_ms
208            .unwrap_or_else(|| self.default_leading_silence_ms());
209        let trail_ms = options
210            .trailing_silence_ms
211            .unwrap_or_else(|| self.default_trailing_silence_ms());
212
213        // Fast path: no padding needed.
214        if lead_ms == 0 && trail_ms == 0 {
215            return self.transcribe_raw(samples, options);
216        }
217
218        let mut buf = if lead_ms > 0 {
219            audio::prepend_silence(samples, lead_ms)
220        } else {
221            samples.to_vec()
222        };
223        if trail_ms > 0 {
224            let trail_len = trail_ms as usize * audio::SAMPLES_PER_MS;
225            buf.resize(buf.len() + trail_len, 0.0);
226        }
227
228        let mut result = self.transcribe_raw(&buf, options)?;
229
230        if lead_ms > 0 {
231            result.offset_timestamps(-(lead_ms as f32 / 1000.0));
232        }
233
234        Ok(result)
235    }
236
237    /// Transcribe a WAV file (16 kHz, 16-bit, mono).
238    fn transcribe_file(
239        &mut self,
240        wav_path: &Path,
241        options: &TranscribeOptions,
242    ) -> Result<TranscriptionResult, TranscribeError> {
243        let samples = audio::read_wav_samples(wav_path)?;
244        self.transcribe(&samples, options)
245    }
246}
247
248/// The result of a transcription operation.
249///
250/// Contains both the full transcribed text and detailed timing information
251/// for individual segments within the audio.
252#[derive(Debug, Clone)]
253pub struct TranscriptionResult {
254    /// The complete transcribed text from the audio
255    pub text: String,
256    /// Individual segments with timing information
257    pub segments: Option<Vec<TranscriptionSegment>>,
258}
259
260impl TranscriptionResult {
261    /// Shift all segment timestamps by `offset_secs`, clamping to zero.
262    ///
263    /// Use a negative offset to compensate for leading silence padding,
264    /// or a positive offset to place a chunk within a longer audio stream.
265    pub fn offset_timestamps(&mut self, offset_secs: f32) {
266        if let Some(segs) = &mut self.segments {
267            for seg in segs {
268                seg.start = (seg.start + offset_secs).max(0.0);
269                seg.end = (seg.end + offset_secs).max(0.0);
270            }
271        }
272    }
273}
274
/// One transcribed segment with its timing information.
///
/// Covers a portion of the input audio, pairing the recognized text with the
/// start and end times at which it was spoken.
#[derive(Debug, Clone)]
pub struct TranscriptionSegment {
    /// Start time of the segment in seconds
    pub start: f32,
    /// End time of the segment in seconds
    pub end: f32,
    /// The transcribed text for this segment
    pub text: String,
}