transcribe_rs/lib.rs
1//! # transcribe-rs
2//!
3//! A Rust library providing unified transcription capabilities using multiple speech recognition engines.
4//! Currently supports Whisper and Parakeet (NeMo) models for accurate speech-to-text transcription.
5//!
6//! ## Features
7//!
8//! - **Multiple Engines**: Support for both Whisper and Parakeet transcription engines
9//! - **Flexible Model Loading**: Load models with custom parameters (quantization, etc.)
10//! - **Timestamped Results**: Get detailed timing information for transcribed segments
11//! - **Audio Processing**: Built-in WAV file processing with proper format validation
12//! - **Unified API**: Common trait-based interface for all transcription engines
13//!
14//! ## Model Format Requirements
15//!
16//! - **Whisper**: Expects a single GGML format file (e.g., `whisper-medium-q4_1.bin`)
17//! - **Parakeet**: Expects a directory containing the model files (e.g., `parakeet-v0.3/`)
18//!
19//! ## Quick Start
20//!
21//! ```rust,no_run
22//! use std::path::PathBuf;
23//! use transcribe_rs::{engines::whisper::WhisperEngine, TranscriptionEngine};
24//!
25//! let mut engine = WhisperEngine::new();
26//! engine.load_model(&PathBuf::from("models/whisper-medium-q4_1.bin"))?;
27//!
28//! let result = engine.transcribe_file(&PathBuf::from("audio.wav"), None)?;
29//! println!("Transcription: {}", result.text);
30//!
31//! if let Some(segments) = result.segments {
32//! for segment in segments {
33//! println!(
34//! "[{:.2}s - {:.2}s]: {}",
35//! segment.start, segment.end, segment.text
36//! );
37//! }
38//! }
39//! # Ok::<(), Box<dyn std::error::Error>>(())
40//! ```
41//!
42//! ## Audio Requirements
43//!
44//! Input audio files must be:
45//! - WAV format
46//! - 16 kHz sample rate
47//! - 16-bit samples
48//! - Mono (single channel)
49
50pub mod audio;
51pub mod engines;
52
53pub mod remote;
54pub use remote::RemoteTranscriptionEngine;
55
56use std::path::Path;
57
/// The result of a transcription operation.
///
/// Bundles the full transcribed text together with optional per-segment
/// timing information for the audio that was processed.
///
/// `Clone` and `PartialEq` are derived so that callers can duplicate results
/// (e.g. to cache them) and compare them directly in assertions.
#[derive(Debug, Clone, PartialEq)]
pub struct TranscriptionResult {
    /// The complete transcribed text from the audio
    pub text: String,
    /// Individual segments with timing information, if any are available
    pub segments: Option<Vec<TranscriptionSegment>>,
}

/// A single transcribed segment with timing information.
///
/// Represents a portion of the transcribed audio: its start and end
/// timestamps and the text recognised within that span.
///
/// Equality is field-wise; since the timestamps are `f32`, a segment whose
/// timestamp is `NaN` never compares equal to anything (including itself).
#[derive(Debug, Clone, PartialEq)]
pub struct TranscriptionSegment {
    /// Start time of the segment in seconds
    pub start: f32,
    /// End time of the segment in seconds
    pub end: f32,
    /// The transcribed text for this segment
    pub text: String,
}
83
84/// Common interface for speech transcription engines.
85///
86/// This trait defines the standard operations that all transcription engines must support.
87/// Each engine may have different parameter types for model loading and inference configuration.
88///
89/// # Examples
90///
91/// ## Using Whisper Engine
92///
93/// ```rust,no_run
94/// use std::path::PathBuf;
95/// use transcribe_rs::{engines::whisper::WhisperEngine, TranscriptionEngine};
96///
97/// let mut engine = WhisperEngine::new();
98/// engine.load_model(&PathBuf::from("models/whisper-medium-q4_1.bin"))?;
99///
100/// let result = engine.transcribe_file(&PathBuf::from("audio.wav"), None)?;
101/// println!("Transcription: {}", result.text);
102/// # Ok::<(), Box<dyn std::error::Error>>(())
103/// ```
104///
105/// ## Using Parakeet Engine
106///
107/// ```rust,no_run
108/// use std::path::PathBuf;
109/// use transcribe_rs::{
110/// engines::parakeet::{ParakeetEngine, ParakeetModelParams},
111/// TranscriptionEngine,
112/// };
113///
114/// let mut engine = ParakeetEngine::new();
115/// engine.load_model_with_params(
116/// &PathBuf::from("models/parakeet-v0.3"),
117/// ParakeetModelParams::int8(),
118/// )?;
119///
120/// let result = engine.transcribe_file(&PathBuf::from("audio.wav"), None)?;
121/// println!("Transcription: {}", result.text);
122/// # Ok::<(), Box<dyn std::error::Error>>(())
123/// ```
124pub trait TranscriptionEngine {
125 /// Parameters for configuring inference behavior (language, timestamps, etc.)
126 type InferenceParams;
127 /// Parameters for configuring model loading (quantization, etc.)
128 type ModelParams: Default;
129
130 /// Load a model from the specified path using default parameters.
131 ///
132 /// # Arguments
133 ///
134 /// * `model_path` - Path to the model file or directory
135 ///
136 /// # Returns
137 ///
138 /// Returns `Ok(())` if the model loads successfully, or an error if loading fails.
139 fn load_model(&mut self, model_path: &Path) -> Result<(), Box<dyn std::error::Error>> {
140 self.load_model_with_params(model_path, Self::ModelParams::default())
141 }
142
143 /// Load a model from the specified path with custom parameters.
144 ///
145 /// # Arguments
146 ///
147 /// * `model_path` - Path to the model file or directory
148 /// * `params` - Engine-specific model loading parameters
149 ///
150 /// # Returns
151 ///
152 /// Returns `Ok(())` if the model loads successfully, or an error if loading fails.
153 fn load_model_with_params(
154 &mut self,
155 model_path: &Path,
156 params: Self::ModelParams,
157 ) -> Result<(), Box<dyn std::error::Error>>;
158
159 /// Unload the currently loaded model and free associated resources.
160 fn unload_model(&mut self);
161
162 /// Transcribe audio samples directly.
163 ///
164 /// # Arguments
165 ///
166 /// * `samples` - Audio samples as f32 values (16kHz, mono)
167 /// * `params` - Optional engine-specific inference parameters
168 ///
169 /// # Returns
170 ///
171 /// Returns transcription result with text and timing information.
172 fn transcribe_samples(
173 &mut self,
174 samples: Vec<f32>,
175 params: Option<Self::InferenceParams>,
176 ) -> Result<TranscriptionResult, Box<dyn std::error::Error>>;
177
178 /// Transcribe audio from a WAV file.
179 ///
180 /// The WAV file must meet the following requirements:
181 /// - 16 kHz sample rate
182 /// - 16-bit samples
183 /// - Mono (single channel)
184 /// - PCM format
185 ///
186 /// # Arguments
187 ///
188 /// * `wav_path` - Path to the WAV file to transcribe
189 /// * `params` - Optional engine-specific inference parameters
190 ///
191 /// # Returns
192 ///
193 /// Returns transcription result with text and timing information.
194 fn transcribe_file(
195 &mut self,
196 wav_path: &Path,
197 params: Option<Self::InferenceParams>,
198 ) -> Result<TranscriptionResult, Box<dyn std::error::Error>> {
199 let samples = audio::read_wav_samples(wav_path)?;
200 self.transcribe_samples(samples, params)
201 }
202}