// wavekat_tts/types.rs

/// A request to synthesize speech from text.
///
/// The parameters are backend-agnostic and describe *what* to synthesize.
/// Each backend interprets `voice`, `instruction`, and `language` according to
/// its own capabilities; fields a backend does not support are silently ignored.
///
/// Every string field borrows from the caller for the lifetime `'a`.
#[derive(Debug, Clone, PartialEq)]
pub struct SynthesizeRequest<'a> {
    /// Text to synthesize.
    pub text: &'a str,

    /// Voice identifier (backend-specific).
    ///
    /// Used by backends with a fixed speaker catalog:
    /// - Edge-TTS: `"zh-CN-XiaoxiaoNeural"`, `"zh-CN-YunxiNeural"`, …
    /// - Kokoro: `"af_heart"`, `"zf_xiaobei"`, …
    ///
    /// `None` selects the backend's default voice.
    pub voice: Option<&'a str>,

    /// Free-form voice instruction / style prompt.
    ///
    /// Used by instruction-following backends (e.g. Qwen3-TTS VoiceDesign).
    /// The text describes how the model should speak:
    ///
    /// ```text
    /// "Speak in a calm, professional tone."
    /// "Narrate with warmth and a gentle pace."
    /// "Respond with high energy and enthusiasm!"
    /// ```
    ///
    /// `None` lets the backend use its default voice character.
    pub instruction: Option<&'a str>,

    /// Language / locale code, e.g. `"zh"`, `"en"`, `"ja"`.
    ///
    /// `None` uses the backend's default or auto-detects.
    pub language: Option<&'a str>,

    /// Speed multiplier; `1.0` is normal speed.
    ///
    /// Values below `1.0` slow speech down, values above `1.0` speed it up.
    /// Not all backends support this; unsupported values are ignored.
    pub speed: Option<f32>,
}
46
47impl<'a> SynthesizeRequest<'a> {
48    /// Create a minimal request with just text.
49    pub fn new(text: &'a str) -> Self {
50        Self {
51            text,
52            voice: None,
53            instruction: None,
54            language: None,
55            speed: None,
56        }
57    }
58
59    /// Set the voice identifier.
60    pub fn with_voice(mut self, voice: &'a str) -> Self {
61        self.voice = Some(voice);
62        self
63    }
64
65    /// Set the voice instruction / style prompt.
66    pub fn with_instruction(mut self, instruction: &'a str) -> Self {
67        self.instruction = Some(instruction);
68        self
69    }
70
71    /// Set the language.
72    pub fn with_language(mut self, language: &'a str) -> Self {
73        self.language = Some(language);
74        self
75    }
76
77    /// Set the speed multiplier.
78    pub fn with_speed(mut self, speed: f32) -> Self {
79        self.speed = Some(speed);
80        self
81    }
82}
83
84/// Metadata about a voice available in a backend.
85#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
86pub struct VoiceInfo {
87    /// Backend-specific voice identifier.
88    pub id: String,
89
90    /// Human-readable display name.
91    pub name: String,
92
93    /// Supported language / locale codes.
94    pub languages: Vec<String>,
95
96    /// Gender hint, if available.
97    pub gender: Option<Gender>,
98}
99
100/// Voice gender hint.
101#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
102pub enum Gender {
103    /// Male voice.
104    Male,
105    /// Female voice.
106    Female,
107    /// Gender-neutral or unspecified voice.
108    Neutral,
109}