pub mod accel;
pub mod audio;
pub mod error;
pub use accel::{
get_ort_accelerator, get_whisper_accelerator, get_whisper_gpu_device, set_ort_accelerator,
set_whisper_accelerator, set_whisper_gpu_device, OrtAccelerator, WhisperAccelerator,
GPU_DEVICE_AUTO,
};
pub use error::TranscribeError;
#[cfg(feature = "audio-features")]
pub mod decode;
#[cfg(feature = "audio-features")]
pub mod features;
#[cfg(feature = "onnx")]
pub mod onnx;
pub mod transcriber;
pub mod vad;
#[cfg(feature = "whisper-cpp")]
pub mod whisper_cpp;
#[cfg(feature = "whisperfile")]
pub mod whisperfile;
#[cfg(feature = "openai")]
pub mod remote;
#[cfg(feature = "openai")]
pub use remote::RemoteTranscriptionEngine;
use std::path::Path;
/// Static description of a speech model, as returned by `SpeechModel::capabilities`.
///
/// All string data is `'static`, so a value of this type borrows nothing from
/// the model that produced it.
#[derive(Debug, Clone)]
pub struct ModelCapabilities {
/// Name of the model.
pub name: &'static str,
/// Identifier of the engine behind the model.
pub engine_id: &'static str,
/// Sample rate associated with the model.
/// NOTE(review): presumably in Hz and describing the expected input audio — confirm.
pub sample_rate: u32,
/// Languages associated with the model.
/// NOTE(review): presumably language codes; the exact format is not visible here — confirm.
pub languages: &'static [&'static str],
/// Whether the model reports timestamp support.
pub supports_timestamps: bool,
/// Whether the model reports translation support.
pub supports_translation: bool,
/// Whether the model reports streaming support.
pub supports_streaming: bool,
}
/// Per-call options passed to the `SpeechModel` transcription methods.
///
/// The derived `Default` yields no language, `translate == false`, and no
/// silence overrides.
#[derive(Debug, Clone, Default)]
pub struct TranscribeOptions {
/// Optional language selection.
/// NOTE(review): how `None` is interpreted is up to each engine — confirm per implementor.
pub language: Option<String>,
/// Translation flag; its handling is left to the `transcribe_raw` implementation.
pub translate: bool,
/// Override, in milliseconds, for the silence `SpeechModel::transcribe` prepends.
/// `None` falls back to the model's `default_leading_silence_ms`.
pub leading_silence_ms: Option<u32>,
/// Override, in milliseconds, for the silence `SpeechModel::transcribe` appends.
/// `None` falls back to the model's `default_trailing_silence_ms`.
pub trailing_silence_ms: Option<u32>,
}
/// Common interface for speech-to-text models.
///
/// Implementors supply [`capabilities`](SpeechModel::capabilities) and
/// [`transcribe_raw`](SpeechModel::transcribe_raw); the remaining methods have
/// default implementations that add silence padding and WAV file loading on
/// top of `transcribe_raw`.
pub trait SpeechModel: Send {
    /// Returns the description of this model.
    fn capabilities(&self) -> ModelCapabilities;

    /// Milliseconds of silence [`transcribe`](SpeechModel::transcribe) prepends
    /// when the options carry no override. Defaults to `0` (no padding).
    fn default_leading_silence_ms(&self) -> u32 {
        0
    }

    /// Milliseconds of silence [`transcribe`](SpeechModel::transcribe) appends
    /// when the options carry no override. Defaults to `0` (no padding).
    fn default_trailing_silence_ms(&self) -> u32 {
        0
    }

    /// Transcribes `samples` exactly as given, without any padding.
    ///
    /// # Errors
    ///
    /// Returns a [`TranscribeError`] when the implementation fails.
    fn transcribe_raw(
        &mut self,
        samples: &[f32],
        options: &TranscribeOptions,
    ) -> Result<TranscriptionResult, TranscribeError>;

    /// Transcribes `samples`, surrounding them with silence where requested.
    ///
    /// The amount of leading/trailing silence comes from `options` when set,
    /// otherwise from the model defaults. When both amounts are zero the call
    /// is forwarded to `transcribe_raw` without copying the input. When
    /// leading silence was added, segment timestamps are shifted back by that
    /// amount (via `TranscriptionResult::offset_timestamps`) so they refer to
    /// the caller's audio rather than the padded buffer.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `transcribe_raw`.
    fn transcribe(
        &mut self,
        samples: &[f32],
        options: &TranscribeOptions,
    ) -> Result<TranscriptionResult, TranscribeError> {
        // An explicit option wins; otherwise ask the model for its default.
        let lead_ms = match options.leading_silence_ms {
            Some(ms) => ms,
            None => self.default_leading_silence_ms(),
        };
        let trail_ms = match options.trailing_silence_ms {
            Some(ms) => ms,
            None => self.default_trailing_silence_ms(),
        };

        let (pad_front, pad_back) = (lead_ms > 0, trail_ms > 0);
        if !pad_front && !pad_back {
            // Nothing to add: skip the buffer copy entirely.
            return self.transcribe_raw(samples, options);
        }

        let mut padded = if pad_front {
            audio::prepend_silence(samples, lead_ms)
        } else {
            samples.to_vec()
        };
        if pad_back {
            // Trailing silence is appended as zero-valued samples.
            let extra = trail_ms as usize * audio::SAMPLES_PER_MS;
            let padded_len = padded.len() + extra;
            padded.resize(padded_len, 0.0);
        }

        let mut result = self.transcribe_raw(&padded, options)?;
        if pad_front {
            // Undo the time shift introduced by the prepended silence.
            let shift_secs = lead_ms as f32 / 1000.0;
            result.offset_timestamps(-shift_secs);
        }
        Ok(result)
    }

    /// Reads samples from the WAV file at `wav_path` and transcribes them with
    /// [`transcribe`](SpeechModel::transcribe).
    ///
    /// # Errors
    ///
    /// Propagates failures from `audio::read_wav_samples` and from
    /// transcription.
    fn transcribe_file(
        &mut self,
        wav_path: &Path,
        options: &TranscribeOptions,
    ) -> Result<TranscriptionResult, TranscribeError> {
        let decoded = audio::read_wav_samples(wav_path)?;
        self.transcribe(&decoded, options)
    }
}
/// Output of a transcription call.
#[derive(Debug, Clone)]
pub struct TranscriptionResult {
    /// Text produced for the whole input.
    pub text: String,
    /// Timed segments, when available; `None` when there are none to report.
    pub segments: Option<Vec<TranscriptionSegment>>,
}

impl TranscriptionResult {
    /// Adds `offset_secs` to the start and end of every segment, clamping each
    /// resulting time to a minimum of `0.0`.
    ///
    /// Does nothing when `segments` is `None`. `text` is never modified.
    pub fn offset_timestamps(&mut self, offset_secs: f32) {
        // Shift a single timestamp and keep it from going negative.
        let shifted = |time: f32| f32::max(time + offset_secs, 0.0);

        // `Option::iter_mut` yields the vector at most once; `flatten` then
        // walks its segments, so the `None` case is an empty loop.
        for segment in self.segments.iter_mut().flatten() {
            segment.start = shifted(segment.start);
            segment.end = shifted(segment.end);
        }
    }
}

/// One timed piece of a transcription.
#[derive(Debug, Clone)]
pub struct TranscriptionSegment {
    /// Start time of the segment, in seconds.
    pub start: f32,
    /// End time of the segment, in seconds.
    pub end: f32,
    /// Text of the segment.
    pub text: String,
}