// typecast_rust/models.rs
//! Data models for the Typecast API.
//!
//! This module contains all the data structures used for API requests and responses.

use serde::{Deserialize, Serialize};
6
7/// TTS model version to use for speech synthesis
8#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
9pub enum TTSModel {
10    /// Latest model with improved prosody and additional emotion presets (recommended)
11    #[serde(rename = "ssfm-v30")]
12    SsfmV30,
13    /// Stable production model with proven reliability and consistent quality
14    #[serde(rename = "ssfm-v21")]
15    SsfmV21,
16}
17
18impl Default for TTSModel {
19    fn default() -> Self {
20        TTSModel::SsfmV30
21    }
22}
23
24/// Emotion preset types for speech synthesis
25#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
26#[serde(rename_all = "lowercase")]
27pub enum EmotionPreset {
28    /// Neutral, balanced tone
29    Normal,
30    /// Bright, cheerful expression
31    Happy,
32    /// Melancholic, subdued tone
33    Sad,
34    /// Strong, intense delivery
35    Angry,
36    /// Soft, quiet speech (ssfm-v30 only)
37    Whisper,
38    /// Higher tonal emphasis (ssfm-v30 only)
39    #[serde(rename = "toneup")]
40    ToneUp,
41    /// Lower tonal emphasis (ssfm-v30 only)
42    #[serde(rename = "tonedown")]
43    ToneDown,
44}
45
46impl Default for EmotionPreset {
47    fn default() -> Self {
48        EmotionPreset::Normal
49    }
50}
51
52/// Audio output format
53#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
54#[serde(rename_all = "lowercase")]
55pub enum AudioFormat {
56    /// Uncompressed PCM audio (16-bit depth, mono, 44100 Hz)
57    Wav,
58    /// Compressed MPEG Layer III audio (320 kbps, 44100 Hz)
59    Mp3,
60}
61
62impl Default for AudioFormat {
63    fn default() -> Self {
64        AudioFormat::Wav
65    }
66}
67
68/// Gender classification for voices
69#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
70#[serde(rename_all = "lowercase")]
71pub enum Gender {
72    Male,
73    Female,
74}
75
76/// Age group classification for voices
77#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
78#[serde(rename_all = "snake_case")]
79pub enum Age {
80    /// Child voice (under 12 years old)
81    Child,
82    /// Teenage voice (13-19 years old)
83    Teenager,
84    /// Young adult voice (20-35 years old)
85    YoungAdult,
86    /// Middle-aged voice (36-60 years old)
87    MiddleAge,
88    /// Elder voice (over 60 years old)
89    Elder,
90}
91
92/// Voice use case categories
93#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
94pub enum UseCase {
95    Announcer,
96    Anime,
97    Audiobook,
98    Conversational,
99    Documentary,
100    #[serde(rename = "E-learning")]
101    ELearning,
102    Rapper,
103    Game,
104    #[serde(rename = "Tiktok/Reels")]
105    TikTokReels,
106    News,
107    Podcast,
108    Voicemail,
109    Ads,
110}
111
112/// Audio output settings
113#[derive(Debug, Clone, Default, Serialize, Deserialize)]
114pub struct Output {
115    /// Volume level (0-200, default: 100)
116    #[serde(skip_serializing_if = "Option::is_none")]
117    pub volume: Option<i32>,
118    /// Pitch adjustment in semitones (-12 to +12, default: 0)
119    #[serde(skip_serializing_if = "Option::is_none")]
120    pub audio_pitch: Option<i32>,
121    /// Speech speed multiplier (0.5 to 2.0, default: 1.0)
122    #[serde(skip_serializing_if = "Option::is_none")]
123    pub audio_tempo: Option<f64>,
124    /// Output audio format (wav or mp3, default: wav)
125    #[serde(skip_serializing_if = "Option::is_none")]
126    pub audio_format: Option<AudioFormat>,
127}
128
129impl Output {
130    /// Create a new Output with default values
131    pub fn new() -> Self {
132        Self::default()
133    }
134
135    /// Set the volume (0-200)
136    pub fn volume(mut self, volume: i32) -> Self {
137        self.volume = Some(volume.clamp(0, 200));
138        self
139    }
140
141    /// Set the audio pitch (-12 to +12 semitones)
142    pub fn audio_pitch(mut self, pitch: i32) -> Self {
143        self.audio_pitch = Some(pitch.clamp(-12, 12));
144        self
145    }
146
147    /// Set the audio tempo (0.5 to 2.0)
148    pub fn audio_tempo(mut self, tempo: f64) -> Self {
149        self.audio_tempo = Some(tempo.clamp(0.5, 2.0));
150        self
151    }
152
153    /// Set the audio format
154    pub fn audio_format(mut self, format: AudioFormat) -> Self {
155        self.audio_format = Some(format);
156        self
157    }
158}
159
160/// Emotion settings for ssfm-v21 model
161#[derive(Debug, Clone, Default, Serialize, Deserialize)]
162pub struct Prompt {
163    /// Emotion preset to apply
164    #[serde(skip_serializing_if = "Option::is_none")]
165    pub emotion_preset: Option<EmotionPreset>,
166    /// Emotion intensity (0.0 to 2.0, default: 1.0)
167    #[serde(skip_serializing_if = "Option::is_none")]
168    pub emotion_intensity: Option<f64>,
169}
170
171impl Prompt {
172    /// Create a new Prompt with default values
173    pub fn new() -> Self {
174        Self::default()
175    }
176
177    /// Set the emotion preset
178    pub fn emotion_preset(mut self, preset: EmotionPreset) -> Self {
179        self.emotion_preset = Some(preset);
180        self
181    }
182
183    /// Set the emotion intensity (0.0 to 2.0)
184    pub fn emotion_intensity(mut self, intensity: f64) -> Self {
185        self.emotion_intensity = Some(intensity.clamp(0.0, 2.0));
186        self
187    }
188}
189
190/// Preset-based emotion control for ssfm-v30 model
191#[derive(Debug, Clone, Serialize, Deserialize)]
192pub struct PresetPrompt {
193    /// Must be "preset" for preset-based emotion control
194    pub emotion_type: String,
195    /// Emotion preset to apply
196    #[serde(skip_serializing_if = "Option::is_none")]
197    pub emotion_preset: Option<EmotionPreset>,
198    /// Emotion intensity (0.0 to 2.0, default: 1.0)
199    #[serde(skip_serializing_if = "Option::is_none")]
200    pub emotion_intensity: Option<f64>,
201}
202
203impl Default for PresetPrompt {
204    fn default() -> Self {
205        Self {
206            emotion_type: "preset".to_string(),
207            emotion_preset: None,
208            emotion_intensity: None,
209        }
210    }
211}
212
213impl PresetPrompt {
214    /// Create a new PresetPrompt
215    pub fn new() -> Self {
216        Self::default()
217    }
218
219    /// Set the emotion preset
220    pub fn emotion_preset(mut self, preset: EmotionPreset) -> Self {
221        self.emotion_preset = Some(preset);
222        self
223    }
224
225    /// Set the emotion intensity (0.0 to 2.0)
226    pub fn emotion_intensity(mut self, intensity: f64) -> Self {
227        self.emotion_intensity = Some(intensity.clamp(0.0, 2.0));
228        self
229    }
230}
231
232/// Context-aware emotion inference for ssfm-v30 model
233#[derive(Debug, Clone, Serialize, Deserialize)]
234pub struct SmartPrompt {
235    /// Must be "smart" for context-aware emotion inference
236    pub emotion_type: String,
237    /// Text that comes before the main text (max 2000 chars)
238    #[serde(skip_serializing_if = "Option::is_none")]
239    pub previous_text: Option<String>,
240    /// Text that comes after the main text (max 2000 chars)
241    #[serde(skip_serializing_if = "Option::is_none")]
242    pub next_text: Option<String>,
243}
244
245impl Default for SmartPrompt {
246    fn default() -> Self {
247        Self {
248            emotion_type: "smart".to_string(),
249            previous_text: None,
250            next_text: None,
251        }
252    }
253}
254
255impl SmartPrompt {
256    /// Create a new SmartPrompt
257    pub fn new() -> Self {
258        Self::default()
259    }
260
261    /// Set the previous text for context
262    pub fn previous_text(mut self, text: impl Into<String>) -> Self {
263        self.previous_text = Some(text.into());
264        self
265    }
266
267    /// Set the next text for context
268    pub fn next_text(mut self, text: impl Into<String>) -> Self {
269        self.next_text = Some(text.into());
270        self
271    }
272}
273
274/// Union type for all prompt types
275#[derive(Debug, Clone, Serialize, Deserialize)]
276#[serde(untagged)]
277pub enum TTSPrompt {
278    /// Basic emotion control (ssfm-v21 compatible)
279    Basic(Prompt),
280    /// Explicit preset emotion control (ssfm-v30)
281    Preset(PresetPrompt),
282    /// Context-aware emotion inference (ssfm-v30)
283    Smart(SmartPrompt),
284}
285
286impl From<Prompt> for TTSPrompt {
287    fn from(prompt: Prompt) -> Self {
288        TTSPrompt::Basic(prompt)
289    }
290}
291
292impl From<PresetPrompt> for TTSPrompt {
293    fn from(prompt: PresetPrompt) -> Self {
294        TTSPrompt::Preset(prompt)
295    }
296}
297
298impl From<SmartPrompt> for TTSPrompt {
299    fn from(prompt: SmartPrompt) -> Self {
300        TTSPrompt::Smart(prompt)
301    }
302}
303
304/// Text-to-Speech request parameters
305#[derive(Debug, Clone, Serialize, Deserialize)]
306pub struct TTSRequest {
307    /// Voice ID in format 'tc_' followed by a unique identifier
308    pub voice_id: String,
309    /// Text to convert to speech (max 2000 chars)
310    pub text: String,
311    /// TTS model to use
312    pub model: TTSModel,
313    /// Language code (ISO 639-3). Auto-detected if not provided
314    #[serde(skip_serializing_if = "Option::is_none")]
315    pub language: Option<String>,
316    /// Emotion and style settings
317    #[serde(skip_serializing_if = "Option::is_none")]
318    pub prompt: Option<TTSPrompt>,
319    /// Audio output settings
320    #[serde(skip_serializing_if = "Option::is_none")]
321    pub output: Option<Output>,
322    /// Random seed for reproducible results
323    #[serde(skip_serializing_if = "Option::is_none")]
324    pub seed: Option<i32>,
325}
326
327impl TTSRequest {
328    /// Create a new TTSRequest with required fields
329    pub fn new(voice_id: impl Into<String>, text: impl Into<String>, model: TTSModel) -> Self {
330        Self {
331            voice_id: voice_id.into(),
332            text: text.into(),
333            model,
334            language: None,
335            prompt: None,
336            output: None,
337            seed: None,
338        }
339    }
340
341    /// Set the language code (ISO 639-3)
342    pub fn language(mut self, language: impl Into<String>) -> Self {
343        self.language = Some(language.into());
344        self
345    }
346
347    /// Set the prompt (emotion settings)
348    pub fn prompt(mut self, prompt: impl Into<TTSPrompt>) -> Self {
349        self.prompt = Some(prompt.into());
350        self
351    }
352
353    /// Set the output settings
354    pub fn output(mut self, output: Output) -> Self {
355        self.output = Some(output);
356        self
357    }
358
359    /// Set the random seed for reproducible results
360    pub fn seed(mut self, seed: i32) -> Self {
361        self.seed = Some(seed);
362        self
363    }
364}
365
366/// Text-to-Speech response
367#[derive(Debug, Clone)]
368pub struct TTSResponse {
369    /// Generated audio data
370    pub audio_data: Vec<u8>,
371    /// Audio duration in seconds
372    pub duration: f64,
373    /// Audio format (wav or mp3)
374    pub format: AudioFormat,
375}
376
377/// Model information with supported emotions
378#[derive(Debug, Clone, Serialize, Deserialize)]
379pub struct ModelInfo {
380    /// TTS model version
381    pub version: TTSModel,
382    /// List of supported emotions for this model
383    pub emotions: Vec<String>,
384}
385
386/// Voice from V2 API with enhanced metadata
387#[derive(Debug, Clone, Serialize, Deserialize)]
388pub struct VoiceV2 {
389    /// Unique voice identifier
390    pub voice_id: String,
391    /// Human-readable name of the voice
392    pub voice_name: String,
393    /// List of supported TTS models with their emotions
394    pub models: Vec<ModelInfo>,
395    /// Voice gender classification
396    #[serde(skip_serializing_if = "Option::is_none")]
397    pub gender: Option<Gender>,
398    /// Voice age group classification
399    #[serde(skip_serializing_if = "Option::is_none")]
400    pub age: Option<Age>,
401    /// List of use case categories
402    #[serde(skip_serializing_if = "Option::is_none")]
403    pub use_cases: Option<Vec<String>>,
404}
405
406/// Filter options for V2 voices endpoint
407#[derive(Debug, Clone, Default)]
408pub struct VoicesV2Filter {
409    /// Filter by TTS model
410    pub model: Option<TTSModel>,
411    /// Filter by gender
412    pub gender: Option<Gender>,
413    /// Filter by age group
414    pub age: Option<Age>,
415    /// Filter by use case
416    pub use_cases: Option<UseCase>,
417}
418
419impl VoicesV2Filter {
420    /// Create a new empty filter
421    pub fn new() -> Self {
422        Self::default()
423    }
424
425    /// Filter by model
426    pub fn model(mut self, model: TTSModel) -> Self {
427        self.model = Some(model);
428        self
429    }
430
431    /// Filter by gender
432    pub fn gender(mut self, gender: Gender) -> Self {
433        self.gender = Some(gender);
434        self
435    }
436
437    /// Filter by age
438    pub fn age(mut self, age: Age) -> Self {
439        self.age = Some(age);
440        self
441    }
442
443    /// Filter by use case
444    pub fn use_cases(mut self, use_case: UseCase) -> Self {
445        self.use_cases = Some(use_case);
446        self
447    }
448}
449
450/// API error response
451#[derive(Debug, Clone, Serialize, Deserialize)]
452pub struct ErrorResponse {
453    /// Error message describing the issue
454    pub detail: String,
455}