Skip to main content

adk_audio/traits/
tts.rs

1//! Text-to-speech provider trait and request types.
2
3use std::pin::Pin;
4
5use async_trait::async_trait;
6use futures::Stream;
7
8use crate::codec::AudioFormat;
9use crate::error::AudioResult;
10use crate::frame::AudioFrame;
11
/// Emotion hint for TTS synthesis.
///
/// A plain, fieldless enum: it is `Copy`, comparable with `==`, and hashable,
/// so it can be used directly as a key in `HashMap` / `HashSet` lookups.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Emotion {
    /// Neutral tone.
    Neutral,
    /// Happy / upbeat tone.
    Happy,
    /// Sad / somber tone.
    Sad,
    /// Angry / forceful tone.
    Angry,
    /// Whispered / quiet tone.
    Whisper,
    /// Excited / energetic tone.
    Excited,
    /// Calm / soothing tone.
    Calm,
}
30
/// Descriptor for an available voice.
///
/// Two descriptors compare equal when all four fields are equal, which lets
/// callers and tests compare catalog entries with `==` / `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Voice {
    /// Provider-specific voice identifier.
    pub id: String,
    /// Human-readable voice name.
    pub name: String,
    /// BCP-47 language code.
    pub language: String,
    /// Optional gender label; `None` when no label is given.
    pub gender: Option<String>,
}
43
44/// Request parameters for TTS synthesis.
45#[derive(Debug, Clone)]
46pub struct TtsRequest {
47    /// Text to synthesize.
48    pub text: String,
49    /// Voice identifier.
50    pub voice: String,
51    /// Optional BCP-47 language code.
52    pub language: Option<String>,
53    /// Speaking speed multiplier (0.5–2.0, default 1.0).
54    pub speed: f32,
55    /// Optional pitch adjustment.
56    pub pitch: Option<f32>,
57    /// Optional emotion hint.
58    pub emotion: Option<Emotion>,
59    /// Desired output format (internal use; providers output PCM16).
60    pub output_format: AudioFormat,
61}
62
63impl Default for TtsRequest {
64    fn default() -> Self {
65        Self {
66            text: String::new(),
67            voice: String::new(),
68            language: None,
69            speed: 1.0,
70            pitch: None,
71            emotion: None,
72            output_format: AudioFormat::Pcm16,
73        }
74    }
75}
76
77/// Unified trait for text-to-speech providers.
78///
79/// Implementors include cloud services (ElevenLabs, OpenAI, Gemini, Cartesia)
80/// and local models (MLX Kokoro, ONNX Kokoro).
81#[async_trait]
82pub trait TtsProvider: Send + Sync {
83    /// Synthesize text to a single audio frame (batch mode).
84    async fn synthesize(&self, request: &TtsRequest) -> AudioResult<AudioFrame>;
85
86    /// Synthesize text as a stream of audio frames (streaming mode).
87    async fn synthesize_stream(
88        &self,
89        request: &TtsRequest,
90    ) -> AudioResult<Pin<Box<dyn Stream<Item = AudioResult<AudioFrame>> + Send>>>;
91
92    /// List available voices for this provider.
93    fn voice_catalog(&self) -> &[Voice];
94}