adk_audio/traits/tts.rs
1//! Text-to-speech provider trait and request types.
2
3use std::pin::Pin;
4
5use async_trait::async_trait;
6use futures::Stream;
7
8use crate::codec::AudioFormat;
9use crate::error::AudioResult;
10use crate::frame::AudioFrame;
11
/// Emotion hint for TTS synthesis.
///
/// A plain, field-less enum: values are `Copy`, comparable, and hashable, so
/// they can be used directly as `HashMap` / `HashSet` keys. The [`Default`]
/// value is [`Emotion::Neutral`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum Emotion {
    /// Neutral tone.
    #[default]
    Neutral,
    /// Happy / upbeat tone.
    Happy,
    /// Sad / somber tone.
    Sad,
    /// Angry / forceful tone.
    Angry,
    /// Whispered / quiet tone.
    Whisper,
    /// Excited / energetic tone.
    Excited,
    /// Calm / soothing tone.
    Calm,
}
30
/// Descriptor for an available voice.
///
/// Two descriptors compare equal only when every field matches, which makes
/// catalog entries easy to compare and de-duplicate.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Voice {
    /// Provider-specific voice identifier.
    pub id: String,
    /// Human-readable voice name.
    pub name: String,
    /// BCP-47 language code.
    pub language: String,
    /// Optional gender label; `None` when no label is given.
    pub gender: Option<String>,
}
43
44/// Request parameters for TTS synthesis.
45#[derive(Debug, Clone)]
46pub struct TtsRequest {
47 /// Text to synthesize.
48 pub text: String,
49 /// Voice identifier.
50 pub voice: String,
51 /// Optional BCP-47 language code.
52 pub language: Option<String>,
53 /// Speaking speed multiplier (0.5–2.0, default 1.0).
54 pub speed: f32,
55 /// Optional pitch adjustment.
56 pub pitch: Option<f32>,
57 /// Optional emotion hint.
58 pub emotion: Option<Emotion>,
59 /// Desired output format (internal use; providers output PCM16).
60 pub output_format: AudioFormat,
61}
62
63impl Default for TtsRequest {
64 fn default() -> Self {
65 Self {
66 text: String::new(),
67 voice: String::new(),
68 language: None,
69 speed: 1.0,
70 pitch: None,
71 emotion: None,
72 output_format: AudioFormat::Pcm16,
73 }
74 }
75}
76
77/// Unified trait for text-to-speech providers.
78///
79/// Implementors include cloud services (ElevenLabs, OpenAI, Gemini, Cartesia)
80/// and local models (MLX Kokoro, ONNX Kokoro).
81#[async_trait]
82pub trait TtsProvider: Send + Sync {
83 /// Synthesize text to a single audio frame (batch mode).
84 async fn synthesize(&self, request: &TtsRequest) -> AudioResult<AudioFrame>;
85
86 /// Synthesize text as a stream of audio frames (streaming mode).
87 async fn synthesize_stream(
88 &self,
89 request: &TtsRequest,
90 ) -> AudioResult<Pin<Box<dyn Stream<Item = AudioResult<AudioFrame>> + Send>>>;
91
92 /// List available voices for this provider.
93 fn voice_catalog(&self) -> &[Voice];
94}