typecast-rust 0.1.2

Official Rust SDK for Typecast Text-to-Speech API
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
//! Data models for the Typecast API
//!
//! This module contains all the data structures used for API requests and responses.

use serde::{Deserialize, Serialize};

/// TTS model version to use for speech synthesis.
///
/// Each variant is (de)serialized under the identifier given in its
/// `serde(rename)` attribute rather than under its Rust name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TTSModel {
    /// Latest model with improved prosody and additional emotion presets (recommended).
    /// Wire name: `"ssfm-v30"`.
    #[serde(rename = "ssfm-v30")]
    SsfmV30,
    /// Stable production model with proven reliability and consistent quality.
    /// Wire name: `"ssfm-v21"`.
    #[serde(rename = "ssfm-v21")]
    SsfmV21,
}

/// The default model is [`TTSModel::SsfmV30`].
impl Default for TTSModel {
    fn default() -> Self {
        Self::SsfmV30
    }
}

/// Emotion preset types for speech synthesis.
///
/// Variants are (de)serialized as lowercase strings (`"normal"`, `"happy"`, ...).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum EmotionPreset {
    /// Neutral, balanced tone
    Normal,
    /// Bright, cheerful expression
    Happy,
    /// Melancholic, subdued tone
    Sad,
    /// Strong, intense delivery
    Angry,
    /// Soft, quiet speech (ssfm-v30 only)
    Whisper,
    /// Higher tonal emphasis (ssfm-v30 only). Wire name: `"toneup"`.
    #[serde(rename = "toneup")]
    ToneUp,
    /// Lower tonal emphasis (ssfm-v30 only). Wire name: `"tonedown"`.
    #[serde(rename = "tonedown")]
    ToneDown,
}

/// The default preset is [`EmotionPreset::Normal`].
impl Default for EmotionPreset {
    fn default() -> Self {
        Self::Normal
    }
}

/// Audio output format.
///
/// Variants are (de)serialized as lowercase strings (`"wav"`, `"mp3"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum AudioFormat {
    /// Uncompressed PCM audio (16-bit depth, mono, 44100 Hz)
    Wav,
    /// Compressed MPEG Layer III audio (320 kbps, 44100 Hz)
    Mp3,
}

/// The default format is [`AudioFormat::Wav`].
impl Default for AudioFormat {
    fn default() -> Self {
        Self::Wav
    }
}

/// Gender classification for voices.
///
/// Variants are (de)serialized as lowercase strings (`"male"`, `"female"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum Gender {
    /// Male voice
    Male,
    /// Female voice
    Female,
}

/// Age group classification for voices.
///
/// Variants are (de)serialized in snake_case, e.g. `YoungAdult` becomes
/// `"young_adult"` and `MiddleAge` becomes `"middle_age"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Age {
    /// Child voice (under 12 years old)
    Child,
    /// Teenage voice (13-19 years old)
    Teenager,
    /// Young adult voice (20-35 years old)
    YoungAdult,
    /// Middle-aged voice (36-60 years old)
    MiddleAge,
    /// Elder voice (over 60 years old)
    Elder,
}

/// Voice use case categories.
///
/// No `rename_all` is applied, so each variant is (de)serialized under its
/// Rust name (e.g. `"Announcer"`) unless it carries an explicit
/// `serde(rename)`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum UseCase {
    /// Wire name: `"Announcer"`
    Announcer,
    /// Wire name: `"Anime"`
    Anime,
    /// Wire name: `"Audiobook"`
    Audiobook,
    /// Wire name: `"Conversational"`
    Conversational,
    /// Wire name: `"Documentary"`
    Documentary,
    /// Wire name: `"E-learning"` (explicitly renamed; contains a hyphen)
    #[serde(rename = "E-learning")]
    ELearning,
    /// Wire name: `"Rapper"`
    Rapper,
    /// Wire name: `"Game"`
    Game,
    /// Wire name: `"Tiktok/Reels"` (explicitly renamed; contains a slash)
    #[serde(rename = "Tiktok/Reels")]
    TikTokReels,
    /// Wire name: `"News"`
    News,
    /// Wire name: `"Podcast"`
    Podcast,
    /// Wire name: `"Voicemail"`
    Voicemail,
    /// Wire name: `"Ads"`
    Ads,
}

/// Audio output settings.
///
/// Every field is optional; a field left as `None` is omitted from the
/// serialized form entirely.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Output {
    /// Volume level (0-200, default: 100).
    /// Cannot be used simultaneously with target_lufs.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub volume: Option<i32>,
    /// Target loudness in LUFS for absolute loudness normalization (-70 to 0).
    /// Cannot be used simultaneously with volume.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub target_lufs: Option<f64>,
    /// Pitch adjustment in semitones (-12 to +12, default: 0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_pitch: Option<i32>,
    /// Speech speed multiplier (0.5 to 2.0, default: 1.0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_tempo: Option<f64>,
    /// Output audio format (wav or mp3, default: wav)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_format: Option<AudioFormat>,
}

/// Consuming builder for [`Output`].
///
/// Every setter takes `self` by value and returns the updated value, so the
/// result must be used (`let out = out.volume(120);`). The setters are marked
/// `#[must_use]` so that dropping the result produces a compiler warning
/// instead of silently losing the setting.
impl Output {
    /// Create a new `Output` with every setting unset (`None`).
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the volume; values outside 0-200 are clamped into that range.
    ///
    /// Cannot be used simultaneously with target_lufs. This builder does not
    /// check for that conflict; it only stores the value.
    #[must_use]
    pub fn volume(self, volume: i32) -> Self {
        Self {
            volume: Some(volume.clamp(0, 200)),
            ..self
        }
    }

    /// Set the target LUFS; values outside -70 to 0 are clamped into that range.
    ///
    /// Cannot be used simultaneously with volume. This builder does not check
    /// for that conflict. A NaN input is stored as NaN, because `f64::clamp`
    /// returns NaN for NaN.
    #[must_use]
    pub fn target_lufs(self, lufs: f64) -> Self {
        Self {
            target_lufs: Some(lufs.clamp(-70.0, 0.0)),
            ..self
        }
    }

    /// Set the audio pitch in semitones; values outside -12 to +12 are clamped.
    #[must_use]
    pub fn audio_pitch(self, pitch: i32) -> Self {
        Self {
            audio_pitch: Some(pitch.clamp(-12, 12)),
            ..self
        }
    }

    /// Set the audio tempo; values outside 0.5 to 2.0 are clamped.
    ///
    /// A NaN input is stored as NaN, because `f64::clamp` returns NaN for NaN.
    #[must_use]
    pub fn audio_tempo(self, tempo: f64) -> Self {
        Self {
            audio_tempo: Some(tempo.clamp(0.5, 2.0)),
            ..self
        }
    }

    /// Set the audio format.
    #[must_use]
    pub fn audio_format(self, format: AudioFormat) -> Self {
        Self {
            audio_format: Some(format),
            ..self
        }
    }
}

/// Emotion settings for ssfm-v21 model.
///
/// Both fields are optional; a field left as `None` is omitted from the
/// serialized form.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Prompt {
    /// Emotion preset to apply
    #[serde(skip_serializing_if = "Option::is_none")]
    pub emotion_preset: Option<EmotionPreset>,
    /// Emotion intensity (0.0 to 2.0, default: 1.0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub emotion_intensity: Option<f64>,
}

/// Consuming builder for [`Prompt`].
///
/// Setters take `self` by value and return the updated value; they are
/// `#[must_use]` so that discarding the result is flagged by the compiler.
impl Prompt {
    /// Create a new `Prompt` with both fields unset (`None`).
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the emotion preset.
    #[must_use]
    pub fn emotion_preset(self, preset: EmotionPreset) -> Self {
        Self {
            emotion_preset: Some(preset),
            ..self
        }
    }

    /// Set the emotion intensity; values outside 0.0 to 2.0 are clamped.
    ///
    /// A NaN input is stored as NaN, because `f64::clamp` returns NaN for NaN.
    #[must_use]
    pub fn emotion_intensity(self, intensity: f64) -> Self {
        Self {
            emotion_intensity: Some(intensity.clamp(0.0, 2.0)),
            ..self
        }
    }
}

/// Preset-based emotion control for ssfm-v30 model.
///
/// `emotion_type` is always serialized; the two optional fields are omitted
/// from the serialized form when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PresetPrompt {
    /// Must be "preset" for preset-based emotion control.
    ///
    /// [`PresetPrompt::new`] / `Default` fill this in. The field is a public
    /// `String`, so nothing in this type prevents it from being changed.
    pub emotion_type: String,
    /// Emotion preset to apply
    #[serde(skip_serializing_if = "Option::is_none")]
    pub emotion_preset: Option<EmotionPreset>,
    /// Emotion intensity (0.0 to 2.0, default: 1.0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub emotion_intensity: Option<f64>,
}

/// Defaults to `emotion_type = "preset"` with no preset and no intensity set.
impl Default for PresetPrompt {
    fn default() -> Self {
        Self {
            emotion_type: String::from("preset"),
            emotion_preset: None,
            emotion_intensity: None,
        }
    }
}

/// Consuming builder for [`PresetPrompt`].
///
/// Setters take `self` by value and return the updated value; they are
/// `#[must_use]` so that discarding the result is flagged by the compiler.
impl PresetPrompt {
    /// Create a new `PresetPrompt` (same as `PresetPrompt::default()`).
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the emotion preset.
    #[must_use]
    pub fn emotion_preset(self, preset: EmotionPreset) -> Self {
        Self {
            emotion_preset: Some(preset),
            ..self
        }
    }

    /// Set the emotion intensity; values outside 0.0 to 2.0 are clamped.
    ///
    /// A NaN input is stored as NaN, because `f64::clamp` returns NaN for NaN.
    #[must_use]
    pub fn emotion_intensity(self, intensity: f64) -> Self {
        Self {
            emotion_intensity: Some(intensity.clamp(0.0, 2.0)),
            ..self
        }
    }
}

/// Context-aware emotion inference for ssfm-v30 model.
///
/// `emotion_type` is always serialized; the two optional context fields are
/// omitted from the serialized form when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SmartPrompt {
    /// Must be "smart" for context-aware emotion inference.
    ///
    /// [`SmartPrompt::new`] / `Default` fill this in. The field is a public
    /// `String`, so nothing in this type prevents it from being changed.
    pub emotion_type: String,
    /// Text that comes before the main text (max 2000 chars)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub previous_text: Option<String>,
    /// Text that comes after the main text (max 2000 chars)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub next_text: Option<String>,
}

/// Defaults to `emotion_type = "smart"` with no surrounding context set.
impl Default for SmartPrompt {
    fn default() -> Self {
        Self {
            emotion_type: String::from("smart"),
            previous_text: None,
            next_text: None,
        }
    }
}

/// Consuming builder for [`SmartPrompt`].
///
/// Setters take `self` by value and return the updated value; they are
/// `#[must_use]` so that discarding the result is flagged by the compiler.
impl SmartPrompt {
    /// Create a new `SmartPrompt` (same as `SmartPrompt::default()`).
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the previous text for context.
    ///
    /// The text is stored as given; the documented 2000-character limit is
    /// not checked here.
    #[must_use]
    pub fn previous_text(self, text: impl Into<String>) -> Self {
        Self {
            previous_text: Some(text.into()),
            ..self
        }
    }

    /// Set the next text for context.
    ///
    /// The text is stored as given; the documented 2000-character limit is
    /// not checked here.
    #[must_use]
    pub fn next_text(self, text: impl Into<String>) -> Self {
        Self {
            next_text: Some(text.into()),
            ..self
        }
    }
}

/// Union type for all prompt types.
///
/// The enum is `#[serde(untagged)]`, so serialization emits the wrapped
/// struct's fields directly with no variant wrapper.
///
/// NOTE(review): when deserializing an untagged enum, serde tries the variants
/// in declaration order. `Prompt` has only optional fields, so it looks like
/// any JSON object would match `Basic` first and `Preset` / `Smart` would
/// never be produced by deserialization. Confirm whether this type is ever
/// deserialized before relying on that path. Serialization is unaffected.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum TTSPrompt {
    /// Basic emotion control (ssfm-v21 compatible)
    Basic(Prompt),
    /// Explicit preset emotion control (ssfm-v30)
    Preset(PresetPrompt),
    /// Context-aware emotion inference (ssfm-v30)
    Smart(SmartPrompt),
}

/// Wraps a [`Prompt`] in [`TTSPrompt::Basic`].
impl From<Prompt> for TTSPrompt {
    fn from(basic: Prompt) -> Self {
        Self::Basic(basic)
    }
}

/// Wraps a [`PresetPrompt`] in [`TTSPrompt::Preset`].
impl From<PresetPrompt> for TTSPrompt {
    fn from(preset: PresetPrompt) -> Self {
        Self::Preset(preset)
    }
}

/// Wraps a [`SmartPrompt`] in [`TTSPrompt::Smart`].
impl From<SmartPrompt> for TTSPrompt {
    fn from(smart: SmartPrompt) -> Self {
        Self::Smart(smart)
    }
}

/// Text-to-Speech request parameters.
///
/// `voice_id`, `text` and `model` are always serialized; the optional fields
/// are omitted from the serialized form when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TTSRequest {
    /// Voice ID in format 'tc_' followed by a unique identifier
    pub voice_id: String,
    /// Text to convert to speech (max 2000 chars)
    pub text: String,
    /// TTS model to use
    pub model: TTSModel,
    /// Language code (ISO 639-3). Auto-detected if not provided
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// Emotion and style settings
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<TTSPrompt>,
    /// Audio output settings
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output: Option<Output>,
    /// Random seed for reproducible results
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
}

/// Consuming builder for [`TTSRequest`].
///
/// Setters take `self` by value and return the updated value; they are
/// `#[must_use]` so that discarding the result is flagged by the compiler.
impl TTSRequest {
    /// Create a new `TTSRequest` with the required fields; all optional
    /// fields start as `None`.
    ///
    /// The arguments are stored as given: neither the voice ID format nor the
    /// documented text length limit is checked here.
    #[must_use]
    pub fn new(voice_id: impl Into<String>, text: impl Into<String>, model: TTSModel) -> Self {
        Self {
            voice_id: voice_id.into(),
            text: text.into(),
            model,
            language: None,
            prompt: None,
            output: None,
            seed: None,
        }
    }

    /// Set the language code (ISO 639-3).
    #[must_use]
    pub fn language(self, language: impl Into<String>) -> Self {
        Self {
            language: Some(language.into()),
            ..self
        }
    }

    /// Set the prompt (emotion settings).
    ///
    /// Accepts anything convertible into [`TTSPrompt`], which includes
    /// `Prompt`, `PresetPrompt` and `SmartPrompt`.
    #[must_use]
    pub fn prompt(self, prompt: impl Into<TTSPrompt>) -> Self {
        Self {
            prompt: Some(prompt.into()),
            ..self
        }
    }

    /// Set the output settings.
    #[must_use]
    pub fn output(self, output: Output) -> Self {
        Self {
            output: Some(output),
            ..self
        }
    }

    /// Set the random seed for reproducible results.
    #[must_use]
    pub fn seed(self, seed: i32) -> Self {
        Self {
            seed: Some(seed),
            ..self
        }
    }
}

/// Text-to-Speech response.
///
/// Unlike the request types, this struct does not derive `Serialize` or
/// `Deserialize`.
#[derive(Debug, Clone)]
pub struct TTSResponse {
    /// Generated audio data, as raw bytes
    pub audio_data: Vec<u8>,
    /// Audio duration in seconds
    pub duration: f64,
    /// Audio format (wav or mp3)
    pub format: AudioFormat,
}

/// Model information with supported emotions
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelInfo {
    /// TTS model version
    pub version: TTSModel,
    /// List of supported emotions for this model.
    ///
    /// Kept as plain strings rather than `EmotionPreset` values.
    pub emotions: Vec<String>,
}

/// Voice from V2 API with enhanced metadata.
///
/// The optional fields are omitted from the serialized form when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VoiceV2 {
    /// Unique voice identifier
    pub voice_id: String,
    /// Human-readable name of the voice
    pub voice_name: String,
    /// List of supported TTS models with their emotions
    pub models: Vec<ModelInfo>,
    /// Voice gender classification
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gender: Option<Gender>,
    /// Voice age group classification
    #[serde(skip_serializing_if = "Option::is_none")]
    pub age: Option<Age>,
    /// List of use case categories.
    ///
    /// Kept as plain strings rather than `UseCase` values.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub use_cases: Option<Vec<String>>,
}

/// Filter options for V2 voices endpoint.
///
/// Every criterion is optional; `Default` / [`VoicesV2Filter::new`] produce a
/// filter with all criteria unset.
#[derive(Debug, Clone, Default)]
pub struct VoicesV2Filter {
    /// Filter by TTS model
    pub model: Option<TTSModel>,
    /// Filter by gender
    pub gender: Option<Gender>,
    /// Filter by age group
    pub age: Option<Age>,
    /// Filter by use case (a single `UseCase`, despite the plural field name)
    pub use_cases: Option<UseCase>,
}

/// Consuming builder for [`VoicesV2Filter`].
///
/// Setters take `self` by value and return the updated value; they are
/// `#[must_use]` so that discarding the result is flagged by the compiler.
impl VoicesV2Filter {
    /// Create a new empty filter (all criteria `None`).
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Filter by model.
    #[must_use]
    pub fn model(self, model: TTSModel) -> Self {
        Self {
            model: Some(model),
            ..self
        }
    }

    /// Filter by gender.
    #[must_use]
    pub fn gender(self, gender: Gender) -> Self {
        Self {
            gender: Some(gender),
            ..self
        }
    }

    /// Filter by age.
    #[must_use]
    pub fn age(self, age: Age) -> Self {
        Self {
            age: Some(age),
            ..self
        }
    }

    /// Filter by use case; a later call replaces the previously set use case.
    #[must_use]
    pub fn use_cases(self, use_case: UseCase) -> Self {
        Self {
            use_cases: Some(use_case),
            ..self
        }
    }
}

/// API error response.
///
/// `detail` is a required `String`, so deserialization fails if the field is
/// absent or is not a string.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorResponse {
    /// Error message describing the issue
    pub detail: String,
}