adk_audio/traits/stt.rs
1//! Speech-to-text provider trait and response types.
2
3use std::pin::Pin;
4
5use async_trait::async_trait;
6use futures::Stream;
7
8use crate::error::AudioResult;
9use crate::frame::AudioFrame;
10
/// Caller-supplied settings that tune a speech-to-text request.
///
/// The derived [`Default`] leaves every hint unset (`None`) and every feature
/// flag off (`false`). Values compare field by field, so two option sets can
/// be checked for equality directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SttOptions {
    /// BCP-47 language tag offered as a hint; `None` supplies no hint.
    pub language: Option<String>,
    /// Whether speaker diarization is requested.
    pub diarize: bool,
    /// Whether per-word timestamps are requested.
    pub word_timestamps: bool,
    /// Whether smart formatting (punctuation, casing) is requested.
    pub smart_format: bool,
    /// Model hint passed to the provider; `None` supplies no hint.
    pub model_hint: Option<String>,
}
25
26/// A transcription result.
27#[derive(Debug, Clone, Default)]
28pub struct Transcript {
29 /// Full transcribed text.
30 pub text: String,
31 /// Per-word details with timestamps.
32 pub words: Vec<Word>,
33 /// Identified speakers.
34 pub speakers: Vec<Speaker>,
35 /// Overall confidence score (0.0–1.0).
36 pub confidence: f32,
37 /// Detected language (BCP-47).
38 pub language_detected: Option<String>,
39}
40
/// One transcribed word together with its timing and confidence.
///
/// Values compare field by field. Only [`PartialEq`] is derived (not `Eq`)
/// because `confidence` is an `f32`.
#[derive(Debug, Clone, PartialEq)]
pub struct Word {
    /// Text of the word.
    pub text: String,
    /// Start time of the word, in milliseconds.
    pub start_ms: u32,
    /// End time of the word, in milliseconds.
    pub end_ms: u32,
    /// Confidence for this word, in the range 0.0–1.0.
    pub confidence: f32,
    /// Speaker ID when diarization is enabled.
    pub speaker: Option<u32>,
}
55
/// A speaker identified in the audio.
///
/// Values compare field by field, so two speakers are equal only when both
/// `id` and `label` match.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Speaker {
    /// Numeric identifier of the speaker.
    pub id: u32,
    /// Human-readable label, if one is available.
    pub label: Option<String>,
}
64
65/// Unified trait for speech-to-text providers.
66///
67/// Implementors include cloud services (Whisper API, Deepgram, AssemblyAI)
68/// and local models (MLX Whisper).
69#[async_trait]
70pub trait SttProvider: Send + Sync {
71 /// Transcribe a single audio frame (batch mode).
72 async fn transcribe(&self, audio: &AudioFrame, opts: &SttOptions) -> AudioResult<Transcript>;
73
74 /// Transcribe a stream of audio frames (streaming mode).
75 async fn transcribe_stream(
76 &self,
77 audio: Pin<Box<dyn Stream<Item = AudioFrame> + Send>>,
78 opts: &SttOptions,
79 ) -> AudioResult<Pin<Box<dyn Stream<Item = AudioResult<Transcript>> + Send>>>;
80}