transcribe_rs/lib.rs
1//! # transcribe-rs
2//!
3//! A Rust library providing unified transcription capabilities using multiple speech recognition engines.
4//!
5//! ## Features
6//!
7//! - **ONNX Models**: SenseVoice, GigaAM, Parakeet, Moonshine (requires `onnx` feature)
8//! - **Whisper**: OpenAI Whisper via GGML (requires `whisper-cpp` feature)
9//! - **Whisperfile**: Mozilla Whisperfile server (requires `whisperfile` feature)
10//! - **Remote**: OpenAI API (requires `openai` feature)
11//! - **Timestamped Results**: Detailed timing information for transcribed segments
12//! - **Unified API**: `SpeechModel` trait for all local engines
13//! - **Hardware Acceleration**: GPU support for ORT engines (`ort-cuda`, `ort-rocm`,
14//! `ort-directml`, `ort-coreml`, `ort-webgpu`) and whisper.cpp (Metal/Vulkan)
15//! via the [`accel`] module
16//!
17//! ## Backend Categories
18//!
19//! This crate provides two categories of transcription backend:
20//!
21//! - **Local models** implement [`SpeechModel`] and run inference in-process or via
22//! a local binary. This includes all ONNX models, Whisper (via whisper.cpp), and
23//! Whisperfile.
24//! - **Remote services** implement [`RemoteTranscriptionEngine`] (requires `openai`
25//! feature) and make async network calls to external APIs. This includes OpenAI.
26//!
27//! These traits are intentionally separate because the execution model differs:
28//! local models are synchronous and take audio samples directly, while remote
29//! services are async and may only accept file uploads.
30//!
31//! ## Quick Start
32//!
33//! ```toml
34//! [dependencies]
35//! transcribe-rs = { version = "0.3", features = ["onnx"] }
36//! ```
37//!
38//! ```ignore
39//! use std::path::PathBuf;
40//! use transcribe_rs::onnx::sense_voice::{SenseVoiceModel, SenseVoiceParams};
41//! use transcribe_rs::onnx::Quantization;
42//! use transcribe_rs::SpeechModel;
43//!
44//! let mut model = SenseVoiceModel::load(
45//! &PathBuf::from("models/sense-voice"),
46//! &Quantization::Int8,
47//! )?;
48//!
49//! let result = model.transcribe(&samples, &transcribe_rs::TranscribeOptions::default())?;
50//! println!("Transcription: {}", result.text);
51//! # Ok::<(), Box<dyn std::error::Error>>(())
52//! ```
53//!
54//! ## Audio Requirements
55//!
56//! Input audio files must be:
57//! - WAV format
58//! - 16 kHz sample rate
59//! - 16-bit samples
60//! - Mono (single channel)
61//!
62//! ## Migrating from 0.2.x to 0.3.0
63//!
64//! Version 0.3.0 is a breaking release. If you need the old API, pin to `version = "=0.2.9"`.
65//!
66//! **`SpeechModel::transcribe` signature changed:**
67//!
68//! ```rust,ignore
69//! // Before (0.2.x):
70//! model.transcribe(&samples, Some("en"))?;
71//! model.transcribe_file(&path, None)?;
72//!
73//! // After (0.3.0):
74//! use transcribe_rs::TranscribeOptions;
75//! model.transcribe(&samples, &TranscribeOptions { language: Some("en".into()), ..Default::default() })?;
76//! model.transcribe_file(&path, &TranscribeOptions::default())?;
77//! ```
78//!
79//! **`SpeechModel` now requires `Send`**, enabling `Box<dyn SpeechModel + Send>` for
80//! use across threads.
81//!
82//! **`TranscribeOptions` includes a `translate` field** (default `false`). Engines that
83//! support translation (Whisper, Whisperfile) will translate to English when set to `true`.
84//!
85//! **Whisper capabilities are now dynamic.** `WhisperEngine::capabilities()` returns the
86//! actual language support of the loaded model (English-only vs multilingual) rather than
87//! always reporting all 99 languages.
88
89pub mod accel;
90pub mod audio;
91pub mod error;
92pub use accel::{
93 get_ort_accelerator, get_whisper_accelerator, get_whisper_gpu_device, set_ort_accelerator,
94 set_whisper_accelerator, set_whisper_gpu_device, OrtAccelerator, WhisperAccelerator,
95 GPU_DEVICE_AUTO,
96};
97pub use error::TranscribeError;
98
99#[cfg(feature = "audio-features")]
100pub mod decode;
101#[cfg(feature = "audio-features")]
102pub mod features;
103#[cfg(feature = "onnx")]
104pub mod onnx;
105
106pub mod transcriber;
107pub mod vad;
108
109#[cfg(feature = "whisper-cpp")]
110pub mod whisper_cpp;
111#[cfg(feature = "whisperfile")]
112pub mod whisperfile;
113
114#[cfg(feature = "openai")]
115pub mod remote;
116#[cfg(feature = "openai")]
117pub use remote::RemoteTranscriptionEngine;
118
119use std::path::Path;
120
/// Static description of what a speech model can do.
///
/// Returned by [`SpeechModel::capabilities`]. Every field is plain data, so
/// two descriptors can be compared with `==` to check whether they describe
/// the same engine configuration.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ModelCapabilities {
    /// Human-readable model name.
    pub name: &'static str,
    /// Machine-friendly engine identifier (e.g. "sense_voice", "whisper_cpp").
    pub engine_id: &'static str,
    /// Expected input sample rate in Hz (e.g. 16000).
    pub sample_rate: u32,
    /// Supported languages as BCP-47 codes (e.g. "en", "zh").
    /// An empty slice means any/unknown.
    pub languages: &'static [&'static str],
    /// Whether the model can produce word/segment timestamps.
    pub supports_timestamps: bool,
    /// Whether the model can translate to English.
    pub supports_translation: bool,
    /// Whether the model supports streaming inference.
    pub supports_streaming: bool,
}
139
/// Per-call options for transcription.
///
/// `Default` yields "no language hint, no translation, engine-default
/// padding". Options compare by value, so a configuration can be checked
/// against `TranscribeOptions::default()` or another instance with `==`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct TranscribeOptions {
    /// Language hint (BCP-47 code, e.g. "en", "zh").
    /// Multilingual models use this as a hint; single-language models ignore it.
    pub language: Option<String>,
    /// Whether to translate the output to English (only supported by some engines).
    pub translate: bool,
    /// Leading silence padding in milliseconds prepended before audio.
    /// Some models (e.g. Parakeet) can drop the beginning of audio due to
    /// mel spectrogram windowing. Set to `Some(0)` to explicitly disable.
    /// When `None`, each engine applies its own default (e.g. Parakeet uses 250 ms).
    pub leading_silence_ms: Option<u32>,
    /// Trailing silence padding in milliseconds appended after audio.
    /// Set to `Some(0)` to explicitly disable.
    /// When `None`, each engine applies its own default (typically 0 ms).
    pub trailing_silence_ms: Option<u32>,
}
158
159/// Unified interface for speech-to-text models.
160///
161/// Each model implements this trait to provide a common transcription API.
162/// Model-specific parameters are exposed via a separate `transcribe_with()` method
163/// on the concrete type.
164///
165/// Engines implement [`transcribe_raw`](SpeechModel::transcribe_raw) with their
166/// inference logic. The default [`transcribe`](SpeechModel::transcribe) method
167/// handles silence padding and timestamp adjustment automatically.
168pub trait SpeechModel: Send {
169 /// Report this model's capabilities.
170 fn capabilities(&self) -> ModelCapabilities;
171
172 /// Default leading silence in milliseconds for this engine.
173 ///
174 /// Override to set a non-zero default. For example, Parakeet returns 250
175 /// because its mel spectrogram preprocessor attenuates the start of audio.
176 fn default_leading_silence_ms(&self) -> u32 {
177 0
178 }
179
180 /// Default trailing silence in milliseconds for this engine.
181 fn default_trailing_silence_ms(&self) -> u32 {
182 0
183 }
184
185 /// Raw transcription — engines implement this with their inference logic.
186 ///
187 /// Callers should prefer [`transcribe`](SpeechModel::transcribe) which
188 /// handles silence padding automatically. Use this method directly only
189 /// when managing padding yourself (e.g. chunked transcription).
190 fn transcribe_raw(
191 &mut self,
192 samples: &[f32],
193 options: &TranscribeOptions,
194 ) -> Result<TranscriptionResult, TranscribeError>;
195
196 /// Transcribe audio samples (16 kHz, mono, f32 in [-1, 1]).
197 ///
198 /// Prepends/appends silence padding based on [`TranscribeOptions`] (or
199 /// engine defaults), runs [`transcribe_raw`](SpeechModel::transcribe_raw),
200 /// then adjusts segment timestamps to account for the leading padding.
201 fn transcribe(
202 &mut self,
203 samples: &[f32],
204 options: &TranscribeOptions,
205 ) -> Result<TranscriptionResult, TranscribeError> {
206 let lead_ms = options
207 .leading_silence_ms
208 .unwrap_or_else(|| self.default_leading_silence_ms());
209 let trail_ms = options
210 .trailing_silence_ms
211 .unwrap_or_else(|| self.default_trailing_silence_ms());
212
213 // Fast path: no padding needed.
214 if lead_ms == 0 && trail_ms == 0 {
215 return self.transcribe_raw(samples, options);
216 }
217
218 let mut buf = if lead_ms > 0 {
219 audio::prepend_silence(samples, lead_ms)
220 } else {
221 samples.to_vec()
222 };
223 if trail_ms > 0 {
224 let trail_len = trail_ms as usize * audio::SAMPLES_PER_MS;
225 buf.resize(buf.len() + trail_len, 0.0);
226 }
227
228 let mut result = self.transcribe_raw(&buf, options)?;
229
230 if lead_ms > 0 {
231 result.offset_timestamps(-(lead_ms as f32 / 1000.0));
232 }
233
234 Ok(result)
235 }
236
237 /// Transcribe a WAV file (16 kHz, 16-bit, mono).
238 fn transcribe_file(
239 &mut self,
240 wav_path: &Path,
241 options: &TranscribeOptions,
242 ) -> Result<TranscriptionResult, TranscribeError> {
243 let samples = audio::read_wav_samples(wav_path)?;
244 self.transcribe(&samples, options)
245 }
246}
247
/// The result of a transcription operation.
///
/// Holds the full transcribed text plus, when the engine provides them,
/// per-segment timings. Results compare by value (`==`), which makes them
/// straightforward to assert on in tests.
#[derive(Debug, Clone, PartialEq)]
pub struct TranscriptionResult {
    /// The complete transcribed text from the audio
    pub text: String,
    /// Individual segments with timing information, or `None` when no
    /// segment data is available.
    pub segments: Option<Vec<TranscriptionSegment>>,
}

impl TranscriptionResult {
    /// Shift all segment timestamps by `offset_secs`, clamping to zero.
    ///
    /// Use a negative offset to compensate for leading silence padding,
    /// or a positive offset to place a chunk within a longer audio stream.
    /// `text` is never modified, and a result without segments is left
    /// untouched.
    pub fn offset_timestamps(&mut self, offset_secs: f32) {
        // One shared rule for both ends of a segment: shift, then clamp so a
        // timestamp never goes negative.
        let shift = |t: f32| (t + offset_secs).max(0.0);

        // `Option::iter_mut` yields the inner Vec zero or one times, so
        // flattening visits every segment when present and nothing otherwise.
        for seg in self.segments.iter_mut().flatten() {
            seg.start = shift(seg.start);
            seg.end = shift(seg.end);
        }
    }
}

/// A single transcribed segment with timing information.
///
/// Represents a portion of the transcribed audio with start and end timestamps
/// and the corresponding text content.
#[derive(Debug, Clone, PartialEq)]
pub struct TranscriptionSegment {
    /// Start time of the segment in seconds
    pub start: f32,
    /// End time of the segment in seconds
    pub end: f32,
    /// The transcribed text for this segment
    pub text: String,
}