transcribe_rs/
lib.rs

1//! # transcribe-rs
2//!
3//! A Rust library providing unified transcription capabilities using multiple speech recognition engines.
4//! Currently supports Whisper and Parakeet (NeMo) models for accurate speech-to-text transcription.
5//!
6//! ## Features
7//!
8//! - **Multiple Engines**: Support for both Whisper and Parakeet transcription engines
9//! - **Flexible Model Loading**: Load models with custom parameters (quantization, etc.)
10//! - **Timestamped Results**: Get detailed timing information for transcribed segments
11//! - **Audio Processing**: Built-in WAV file processing with proper format validation
12//! - **Unified API**: Common trait-based interface for all transcription engines
13//!
14//! ## Model Format Requirements
15//!
16//! - **Whisper**: Expects a single GGML format file (e.g., `whisper-medium-q4_1.bin`)
17//! - **Parakeet**: Expects a directory containing the model files (e.g., `parakeet-v0.3/`)
18//!
19//! ## Quick Start
20//!
21//! ```rust,no_run
22//! use std::path::PathBuf;
23//! use transcribe_rs::{engines::whisper::WhisperEngine, TranscriptionEngine};
24//!
25//! let mut engine = WhisperEngine::new();
26//! engine.load_model(&PathBuf::from("models/whisper-medium-q4_1.bin"))?;
27//!
28//! let result = engine.transcribe_file(&PathBuf::from("audio.wav"), None)?;
29//! println!("Transcription: {}", result.text);
30//!
31//! if let Some(segments) = result.segments {
32//!     for segment in segments {
33//!         println!(
34//!             "[{:.2}s - {:.2}s]: {}",
35//!             segment.start, segment.end, segment.text
36//!         );
37//!     }
38//! }
39//! # Ok::<(), Box<dyn std::error::Error>>(())
40//! ```
41//!
42//! ## Audio Requirements
43//!
44//! Input audio files must be:
45//! - WAV format
46//! - 16 kHz sample rate
47//! - 16-bit samples
48//! - Mono (single channel)
49
50pub mod audio;
51pub mod engines;
52
53pub mod remote;
54pub use remote::RemoteTranscriptionEngine;
55
56use std::path::Path;
57
/// The result of a transcription operation.
///
/// Holds the full transcribed text and, when present, the list of timed
/// segments that make up the transcription.
///
/// The type is `Clone` and `PartialEq` so results can be duplicated and
/// compared directly (for example with `assert_eq!` in tests).
#[derive(Debug, Clone, PartialEq)]
pub struct TranscriptionResult {
    /// The complete transcribed text from the audio
    pub text: String,
    /// Individual segments with timing information, when present
    pub segments: Option<Vec<TranscriptionSegment>>,
}

/// A single transcribed segment with timing information.
///
/// Pairs a piece of transcribed text with the start and end timestamps of
/// the audio it was produced from.
///
/// Only `PartialEq` (not `Eq`) is derived because the timestamps are `f32`.
#[derive(Debug, Clone, PartialEq)]
pub struct TranscriptionSegment {
    /// Start time of the segment in seconds
    pub start: f32,
    /// End time of the segment in seconds
    pub end: f32,
    /// The transcribed text for this segment
    pub text: String,
}
83
84/// Common interface for speech transcription engines.
85///
86/// This trait defines the standard operations that all transcription engines must support.
87/// Each engine may have different parameter types for model loading and inference configuration.
88///
89/// # Examples
90///
91/// ## Using Whisper Engine
92///
93/// ```rust,no_run
94/// use std::path::PathBuf;
95/// use transcribe_rs::{engines::whisper::WhisperEngine, TranscriptionEngine};
96///
97/// let mut engine = WhisperEngine::new();
98/// engine.load_model(&PathBuf::from("models/whisper-medium-q4_1.bin"))?;
99///
100/// let result = engine.transcribe_file(&PathBuf::from("audio.wav"), None)?;
101/// println!("Transcription: {}", result.text);
102/// # Ok::<(), Box<dyn std::error::Error>>(())
103/// ```
104///
105/// ## Using Parakeet Engine
106///
107/// ```rust,no_run
108/// use std::path::PathBuf;
109/// use transcribe_rs::{
110///     engines::parakeet::{ParakeetEngine, ParakeetModelParams},
111///     TranscriptionEngine,
112/// };
113///
114/// let mut engine = ParakeetEngine::new();
115/// engine.load_model_with_params(
116///     &PathBuf::from("models/parakeet-v0.3"),
117///     ParakeetModelParams::int8(),
118/// )?;
119///
120/// let result = engine.transcribe_file(&PathBuf::from("audio.wav"), None)?;
121/// println!("Transcription: {}", result.text);
122/// # Ok::<(), Box<dyn std::error::Error>>(())
123/// ```
124pub trait TranscriptionEngine {
125    /// Parameters for configuring inference behavior (language, timestamps, etc.)
126    type InferenceParams;
127    /// Parameters for configuring model loading (quantization, etc.)
128    type ModelParams: Default;
129
130    /// Load a model from the specified path using default parameters.
131    ///
132    /// # Arguments
133    ///
134    /// * `model_path` - Path to the model file or directory
135    ///
136    /// # Returns
137    ///
138    /// Returns `Ok(())` if the model loads successfully, or an error if loading fails.
139    fn load_model(&mut self, model_path: &Path) -> Result<(), Box<dyn std::error::Error>> {
140        self.load_model_with_params(model_path, Self::ModelParams::default())
141    }
142
143    /// Load a model from the specified path with custom parameters.
144    ///
145    /// # Arguments
146    ///
147    /// * `model_path` - Path to the model file or directory
148    /// * `params` - Engine-specific model loading parameters
149    ///
150    /// # Returns
151    ///
152    /// Returns `Ok(())` if the model loads successfully, or an error if loading fails.
153    fn load_model_with_params(
154        &mut self,
155        model_path: &Path,
156        params: Self::ModelParams,
157    ) -> Result<(), Box<dyn std::error::Error>>;
158
159    /// Unload the currently loaded model and free associated resources.
160    fn unload_model(&mut self);
161
162    /// Transcribe audio samples directly.
163    ///
164    /// # Arguments
165    ///
166    /// * `samples` - Audio samples as f32 values (16kHz, mono)
167    /// * `params` - Optional engine-specific inference parameters
168    ///
169    /// # Returns
170    ///
171    /// Returns transcription result with text and timing information.
172    fn transcribe_samples(
173        &mut self,
174        samples: Vec<f32>,
175        params: Option<Self::InferenceParams>,
176    ) -> Result<TranscriptionResult, Box<dyn std::error::Error>>;
177
178    /// Transcribe audio from a WAV file.
179    ///
180    /// The WAV file must meet the following requirements:
181    /// - 16 kHz sample rate
182    /// - 16-bit samples
183    /// - Mono (single channel)
184    /// - PCM format
185    ///
186    /// # Arguments
187    ///
188    /// * `wav_path` - Path to the WAV file to transcribe
189    /// * `params` - Optional engine-specific inference parameters
190    ///
191    /// # Returns
192    ///
193    /// Returns transcription result with text and timing information.
194    fn transcribe_file(
195        &mut self,
196        wav_path: &Path,
197        params: Option<Self::InferenceParams>,
198    ) -> Result<TranscriptionResult, Box<dyn std::error::Error>> {
199        let samples = audio::read_wav_samples(wav_path)?;
200        self.transcribe_samples(samples, params)
201    }
202}