Skip to main content

// adk_audio/traits/stt.rs

//! Speech-to-text provider trait and response types.
2
3use std::pin::Pin;
4
5use async_trait::async_trait;
6use futures::Stream;
7
8use crate::error::AudioResult;
9use crate::frame::AudioFrame;
10
/// Options for speech-to-text transcription.
///
/// The derived [`Default`] leaves both hints as `None` and every feature
/// flag `false`, so callers can override only what they need with
/// struct-update syntax, e.g.
/// `SttOptions { diarize: true, ..Default::default() }`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SttOptions {
    /// Optional BCP-47 language hint; `None` means no hint is supplied.
    pub language: Option<String>,
    /// Enable speaker diarization.
    pub diarize: bool,
    /// Include per-word timestamps.
    pub word_timestamps: bool,
    /// Apply smart formatting (punctuation, casing).
    pub smart_format: bool,
    /// Optional model hint for the provider; `None` means no hint is supplied.
    pub model_hint: Option<String>,
}
25
26/// A transcription result.
27#[derive(Debug, Clone, Default)]
28pub struct Transcript {
29    /// Full transcribed text.
30    pub text: String,
31    /// Per-word details with timestamps.
32    pub words: Vec<Word>,
33    /// Identified speakers.
34    pub speakers: Vec<Speaker>,
35    /// Overall confidence score (0.0–1.0).
36    pub confidence: f32,
37    /// Detected language (BCP-47).
38    pub language_detected: Option<String>,
39}
40
/// A single word with timing and confidence.
///
/// Times are stored as whole milliseconds in `u32`. This type performs no
/// validation, so it does not enforce `start_ms <= end_ms`.
#[derive(Debug, Clone, PartialEq)]
pub struct Word {
    /// The word text.
    pub text: String,
    /// Start time in milliseconds.
    pub start_ms: u32,
    /// End time in milliseconds.
    pub end_ms: u32,
    /// Word-level confidence (0.0–1.0).
    pub confidence: f32,
    /// Speaker ID if diarization is enabled; `None` otherwise.
    pub speaker: Option<u32>,
}
55
/// An identified speaker.
///
/// Equality and hashing cover both fields, so two values with the same `id`
/// but different `label`s compare as different.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Speaker {
    /// Numeric speaker identifier.
    pub id: u32,
    /// Optional human-readable label.
    pub label: Option<String>,
}
64
65/// Unified trait for speech-to-text providers.
66///
67/// Implementors include cloud services (Whisper API, Deepgram, AssemblyAI)
68/// and local models (MLX Whisper).
69#[async_trait]
70pub trait SttProvider: Send + Sync {
71    /// Transcribe a single audio frame (batch mode).
72    async fn transcribe(&self, audio: &AudioFrame, opts: &SttOptions) -> AudioResult<Transcript>;
73
74    /// Transcribe a stream of audio frames (streaming mode).
75    async fn transcribe_stream(
76        &self,
77        audio: Pin<Box<dyn Stream<Item = AudioFrame> + Send>>,
78        opts: &SttOptions,
79    ) -> AudioResult<Pin<Box<dyn Stream<Item = AudioResult<Transcript>> + Send>>>;
80}