//! Data models for the Typecast API
//!
//! This module contains all the data structures used for API requests and responses.
use serde::{Deserialize, Serialize};
7/// TTS model version to use for speech synthesis
8#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
9pub enum TTSModel {
10    /// Latest model with improved prosody and additional emotion presets (recommended)
11    #[serde(rename = "ssfm-v30")]
12    SsfmV30,
13    /// Stable production model with proven reliability and consistent quality
14    #[serde(rename = "ssfm-v21")]
15    SsfmV21,
16}
17
18impl Default for TTSModel {
19    fn default() -> Self {
20        TTSModel::SsfmV30
21    }
22}
23
24/// Emotion preset types for speech synthesis
25#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
26#[serde(rename_all = "lowercase")]
27pub enum EmotionPreset {
28    /// Neutral, balanced tone
29    Normal,
30    /// Bright, cheerful expression
31    Happy,
32    /// Melancholic, subdued tone
33    Sad,
34    /// Strong, intense delivery
35    Angry,
36    /// Soft, quiet speech (ssfm-v30 only)
37    Whisper,
38    /// Higher tonal emphasis (ssfm-v30 only)
39    #[serde(rename = "toneup")]
40    ToneUp,
41    /// Lower tonal emphasis (ssfm-v30 only)
42    #[serde(rename = "tonedown")]
43    ToneDown,
44}
45
46impl Default for EmotionPreset {
47    fn default() -> Self {
48        EmotionPreset::Normal
49    }
50}
51
52/// Audio output format
53#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
54#[serde(rename_all = "lowercase")]
55pub enum AudioFormat {
56    /// Uncompressed PCM audio (16-bit depth, mono, 44100 Hz)
57    Wav,
58    /// Compressed MPEG Layer III audio (320 kbps, 44100 Hz)
59    Mp3,
60}
61
62impl Default for AudioFormat {
63    fn default() -> Self {
64        AudioFormat::Wav
65    }
66}
67
/// Gender classification for voices.
///
/// Serialized in lowercase ("male" / "female").
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum Gender {
    /// Male voice
    Male,
    /// Female voice
    Female,
}
/// Age group classification for voices.
///
/// Serialized in snake_case (e.g. "young_adult", "middle_age").
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Age {
    /// Child voice (under 12 years old)
    Child,
    /// Teenage voice (13-19 years old)
    Teenager,
    /// Young adult voice (20-35 years old)
    YoungAdult,
    /// Middle-aged voice (36-60 years old)
    MiddleAge,
    /// Elder voice (over 60 years old)
    Elder,
}
/// Voice use case categories.
///
/// Variant names serialize as-is (UpperCamelCase) except for the two
/// explicit renames, which match the API's literal category strings.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum UseCase {
    Announcer,
    Anime,
    Audiobook,
    Conversational,
    Documentary,
    /// Serialized as "E-learning" (non-standard casing in the API)
    #[serde(rename = "E-learning")]
    ELearning,
    Rapper,
    Game,
    /// Serialized as "Tiktok/Reels" (contains a slash in the API)
    #[serde(rename = "Tiktok/Reels")]
    TikTokReels,
    News,
    Podcast,
    Voicemail,
    Ads,
}
/// Audio output settings.
///
/// All fields are optional; `None` fields are omitted from the serialized
/// request so the API applies its own defaults.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Output {
    /// Volume level (0-200, default: 100).
    /// Cannot be used simultaneously with target_lufs.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub volume: Option<i32>,
    /// Target loudness in LUFS for absolute loudness normalization (-70 to 0).
    /// Cannot be used simultaneously with volume.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub target_lufs: Option<f64>,
    /// Pitch adjustment in semitones (-12 to +12, default: 0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_pitch: Option<i32>,
    /// Speech speed multiplier (0.5 to 2.0, default: 1.0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_tempo: Option<f64>,
    /// Output audio format (wav or mp3, default: wav)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_format: Option<AudioFormat>,
}
134impl Output {
135    /// Create a new Output with default values
136    pub fn new() -> Self {
137        Self::default()
138    }
139
140    /// Set the volume (0-200).
141    /// Cannot be used simultaneously with target_lufs.
142    pub fn volume(mut self, volume: i32) -> Self {
143        self.volume = Some(volume.clamp(0, 200));
144        self
145    }
146
147    /// Set the target LUFS (-70 to 0)
148    pub fn target_lufs(mut self, lufs: f64) -> Self {
149        self.target_lufs = Some(lufs.clamp(-70.0, 0.0));
150        self
151    }
152
153    /// Set the audio pitch (-12 to +12 semitones)
154    pub fn audio_pitch(mut self, pitch: i32) -> Self {
155        self.audio_pitch = Some(pitch.clamp(-12, 12));
156        self
157    }
158
159    /// Set the audio tempo (0.5 to 2.0)
160    pub fn audio_tempo(mut self, tempo: f64) -> Self {
161        self.audio_tempo = Some(tempo.clamp(0.5, 2.0));
162        self
163    }
164
165    /// Set the audio format
166    pub fn audio_format(mut self, format: AudioFormat) -> Self {
167        self.audio_format = Some(format);
168        self
169    }
170}
171
/// Emotion settings for the ssfm-v21 model.
///
/// `None` fields are omitted from the serialized request.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Prompt {
    /// Emotion preset to apply
    #[serde(skip_serializing_if = "Option::is_none")]
    pub emotion_preset: Option<EmotionPreset>,
    /// Emotion intensity (0.0 to 2.0, default: 1.0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub emotion_intensity: Option<f64>,
}
183impl Prompt {
184    /// Create a new Prompt with default values
185    pub fn new() -> Self {
186        Self::default()
187    }
188
189    /// Set the emotion preset
190    pub fn emotion_preset(mut self, preset: EmotionPreset) -> Self {
191        self.emotion_preset = Some(preset);
192        self
193    }
194
195    /// Set the emotion intensity (0.0 to 2.0)
196    pub fn emotion_intensity(mut self, intensity: f64) -> Self {
197        self.emotion_intensity = Some(intensity.clamp(0.0, 2.0));
198        self
199    }
200}
201
/// Preset-based emotion control for the ssfm-v30 model.
///
/// The API distinguishes prompt kinds by the `emotion_type` discriminator
/// field, which must hold the literal string "preset" for this variant.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PresetPrompt {
    /// Must be "preset" for preset-based emotion control
    pub emotion_type: String,
    /// Emotion preset to apply
    #[serde(skip_serializing_if = "Option::is_none")]
    pub emotion_preset: Option<EmotionPreset>,
    /// Emotion intensity (0.0 to 2.0, default: 1.0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub emotion_intensity: Option<f64>,
}
215impl Default for PresetPrompt {
216    fn default() -> Self {
217        Self {
218            emotion_type: "preset".to_string(),
219            emotion_preset: None,
220            emotion_intensity: None,
221        }
222    }
223}
224
225impl PresetPrompt {
226    /// Create a new PresetPrompt
227    pub fn new() -> Self {
228        Self::default()
229    }
230
231    /// Set the emotion preset
232    pub fn emotion_preset(mut self, preset: EmotionPreset) -> Self {
233        self.emotion_preset = Some(preset);
234        self
235    }
236
237    /// Set the emotion intensity (0.0 to 2.0)
238    pub fn emotion_intensity(mut self, intensity: f64) -> Self {
239        self.emotion_intensity = Some(intensity.clamp(0.0, 2.0));
240        self
241    }
242}
243
/// Context-aware emotion inference for the ssfm-v30 model.
///
/// The API distinguishes prompt kinds by the `emotion_type` discriminator
/// field, which must hold the literal string "smart" for this variant.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SmartPrompt {
    /// Must be "smart" for context-aware emotion inference
    pub emotion_type: String,
    /// Text that comes before the main text (max 2000 chars)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub previous_text: Option<String>,
    /// Text that comes after the main text (max 2000 chars)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub next_text: Option<String>,
}
257impl Default for SmartPrompt {
258    fn default() -> Self {
259        Self {
260            emotion_type: "smart".to_string(),
261            previous_text: None,
262            next_text: None,
263        }
264    }
265}
266
267impl SmartPrompt {
268    /// Create a new SmartPrompt
269    pub fn new() -> Self {
270        Self::default()
271    }
272
273    /// Set the previous text for context
274    pub fn previous_text(mut self, text: impl Into<String>) -> Self {
275        self.previous_text = Some(text.into());
276        self
277    }
278
279    /// Set the next text for context
280    pub fn next_text(mut self, text: impl Into<String>) -> Self {
281        self.next_text = Some(text.into());
282        self
283    }
284}
285
/// Union type for all prompt types.
///
/// `#[serde(untagged)]` means the variant is chosen by field shape rather
/// than an external tag; `PresetPrompt`/`SmartPrompt` carry their own
/// `emotion_type` discriminator field for the API.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum TTSPrompt {
    /// Basic emotion control (ssfm-v21 compatible)
    Basic(Prompt),
    /// Explicit preset emotion control (ssfm-v30)
    Preset(PresetPrompt),
    /// Context-aware emotion inference (ssfm-v30)
    Smart(SmartPrompt),
}
298impl From<Prompt> for TTSPrompt {
299    fn from(prompt: Prompt) -> Self {
300        TTSPrompt::Basic(prompt)
301    }
302}
303
304impl From<PresetPrompt> for TTSPrompt {
305    fn from(prompt: PresetPrompt) -> Self {
306        TTSPrompt::Preset(prompt)
307    }
308}
309
310impl From<SmartPrompt> for TTSPrompt {
311    fn from(prompt: SmartPrompt) -> Self {
312        TTSPrompt::Smart(prompt)
313    }
314}
315
/// Text-to-Speech request parameters.
///
/// Optional fields set to `None` are omitted from the serialized request.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TTSRequest {
    /// Voice ID in format 'tc_' followed by a unique identifier
    pub voice_id: String,
    /// Text to convert to speech (max 2000 chars)
    pub text: String,
    /// TTS model to use
    pub model: TTSModel,
    /// Language code (ISO 639-3). Auto-detected if not provided
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// Emotion and style settings
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<TTSPrompt>,
    /// Audio output settings
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output: Option<Output>,
    /// Random seed for reproducible results
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
}
339impl TTSRequest {
340    /// Create a new TTSRequest with required fields
341    pub fn new(voice_id: impl Into<String>, text: impl Into<String>, model: TTSModel) -> Self {
342        Self {
343            voice_id: voice_id.into(),
344            text: text.into(),
345            model,
346            language: None,
347            prompt: None,
348            output: None,
349            seed: None,
350        }
351    }
352
353    /// Set the language code (ISO 639-3)
354    pub fn language(mut self, language: impl Into<String>) -> Self {
355        self.language = Some(language.into());
356        self
357    }
358
359    /// Set the prompt (emotion settings)
360    pub fn prompt(mut self, prompt: impl Into<TTSPrompt>) -> Self {
361        self.prompt = Some(prompt.into());
362        self
363    }
364
365    /// Set the output settings
366    pub fn output(mut self, output: Output) -> Self {
367        self.output = Some(output);
368        self
369    }
370
371    /// Set the random seed for reproducible results
372    pub fn seed(mut self, seed: i32) -> Self {
373        self.seed = Some(seed);
374        self
375    }
376}
377
/// Text-to-Speech response.
///
/// Not serde-derived: the audio payload arrives as raw bytes, with the
/// duration and format presumably taken from response metadata by the
/// client code — confirm against the call site.
#[derive(Debug, Clone)]
pub struct TTSResponse {
    /// Generated audio data
    pub audio_data: Vec<u8>,
    /// Audio duration in seconds
    pub duration: f64,
    /// Audio format (wav or mp3)
    pub format: AudioFormat,
}
/// Model information with supported emotions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelInfo {
    /// TTS model version
    pub version: TTSModel,
    /// List of supported emotions for this model, as raw API strings
    pub emotions: Vec<String>,
}
/// Voice from V2 API with enhanced metadata.
///
/// Metadata fields are optional; `None` fields are omitted when serialized.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VoiceV2 {
    /// Unique voice identifier
    pub voice_id: String,
    /// Human-readable name of the voice
    pub voice_name: String,
    /// List of supported TTS models with their emotions
    pub models: Vec<ModelInfo>,
    /// Voice gender classification
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gender: Option<Gender>,
    /// Voice age group classification
    #[serde(skip_serializing_if = "Option::is_none")]
    pub age: Option<Age>,
    /// List of use case categories, as raw API strings
    #[serde(skip_serializing_if = "Option::is_none")]
    pub use_cases: Option<Vec<String>>,
}
/// Filter options for the V2 voices endpoint.
///
/// Client-side only (no serde derives); presumably translated into query
/// parameters by the endpoint caller — confirm against the client code.
#[derive(Debug, Clone, Default)]
pub struct VoicesV2Filter {
    /// Filter by TTS model
    pub model: Option<TTSModel>,
    /// Filter by gender
    pub gender: Option<Gender>,
    /// Filter by age group
    pub age: Option<Age>,
    /// Filter by use case (a single category, despite the plural name)
    pub use_cases: Option<UseCase>,
}
431impl VoicesV2Filter {
432    /// Create a new empty filter
433    pub fn new() -> Self {
434        Self::default()
435    }
436
437    /// Filter by model
438    pub fn model(mut self, model: TTSModel) -> Self {
439        self.model = Some(model);
440        self
441    }
442
443    /// Filter by gender
444    pub fn gender(mut self, gender: Gender) -> Self {
445        self.gender = Some(gender);
446        self
447    }
448
449    /// Filter by age
450    pub fn age(mut self, age: Age) -> Self {
451        self.age = Some(age);
452        self
453    }
454
455    /// Filter by use case
456    pub fn use_cases(mut self, use_case: UseCase) -> Self {
457        self.use_cases = Some(use_case);
458        self
459    }
460}
461
/// API error response body.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorResponse {
    /// Error message describing the issue
    pub detail: String,
}