// typecast_rust — src/models.rs
//! Data models for the Typecast API
//!
//! This module contains all the data structures used for API requests and responses.

use serde::{Deserialize, Serialize};
6
7/// TTS model version to use for speech synthesis
8#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
9pub enum TTSModel {
10    /// Latest model with improved prosody and additional emotion presets (recommended)
11    #[serde(rename = "ssfm-v30")]
12    SsfmV30,
13    /// Stable production model with proven reliability and consistent quality
14    #[serde(rename = "ssfm-v21")]
15    SsfmV21,
16}
17
18impl Default for TTSModel {
19    fn default() -> Self {
20        TTSModel::SsfmV30
21    }
22}
23
24/// Emotion preset types for speech synthesis
25#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
26#[serde(rename_all = "lowercase")]
27pub enum EmotionPreset {
28    /// Neutral, balanced tone
29    Normal,
30    /// Bright, cheerful expression
31    Happy,
32    /// Melancholic, subdued tone
33    Sad,
34    /// Strong, intense delivery
35    Angry,
36    /// Soft, quiet speech (ssfm-v30 only)
37    Whisper,
38    /// Higher tonal emphasis (ssfm-v30 only)
39    #[serde(rename = "toneup")]
40    ToneUp,
41    /// Lower tonal emphasis (ssfm-v30 only)
42    #[serde(rename = "tonedown")]
43    ToneDown,
44}
45
46impl Default for EmotionPreset {
47    fn default() -> Self {
48        EmotionPreset::Normal
49    }
50}
51
52/// Audio output format
53#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
54#[serde(rename_all = "lowercase")]
55pub enum AudioFormat {
56    /// Uncompressed PCM audio (16-bit depth, mono, 44100 Hz)
57    Wav,
58    /// Compressed MPEG Layer III audio (320 kbps, 44100 Hz)
59    Mp3,
60}
61
62impl Default for AudioFormat {
63    fn default() -> Self {
64        AudioFormat::Wav
65    }
66}
67
/// Gender classification for voices.
///
/// Serialized in lowercase (`"male"` / `"female"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum Gender {
    /// Male voice
    Male,
    /// Female voice
    Female,
}
75
/// Age group classification for voices.
///
/// Serialized in snake_case (e.g. `"young_adult"`, `"middle_age"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Age {
    /// Child voice (under 12 years old)
    Child,
    /// Teenage voice (13-19 years old)
    Teenager,
    /// Young adult voice (20-35 years old)
    YoungAdult,
    /// Middle-aged voice (36-60 years old)
    MiddleAge,
    /// Elder voice (over 60 years old)
    Elder,
}
91
/// Voice use case categories.
///
/// No `rename_all` attribute is applied, so most variants serialize as their
/// UpperCamelCase Rust names (e.g. `"Announcer"`, `"News"`); `ELearning` and
/// `TikTokReels` carry explicit renames to match the API's own spelling
/// (`"E-learning"`, `"Tiktok/Reels"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum UseCase {
    /// Broadcast announcer style
    Announcer,
    /// Anime character voicing
    Anime,
    /// Audiobook narration
    Audiobook,
    /// Everyday conversational speech
    Conversational,
    /// Documentary narration
    Documentary,
    /// E-learning course narration
    #[serde(rename = "E-learning")]
    ELearning,
    /// Rap-style delivery
    Rapper,
    /// Game character voicing
    Game,
    /// Short-form social video (TikTok/Reels)
    #[serde(rename = "Tiktok/Reels")]
    TikTokReels,
    /// News reading
    News,
    /// Podcast hosting
    Podcast,
    /// Voicemail greetings
    Voicemail,
    /// Advertisement reads
    Ads,
}
111
/// Audio output settings.
///
/// All fields are optional; unset (`None`) fields are omitted from the
/// serialized request so the API applies its own defaults.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Output {
    /// Volume level (0-200, default: 100).
    /// Cannot be used simultaneously with target_lufs.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub volume: Option<i32>,
    /// Target loudness in LUFS for absolute loudness normalization (-70 to 0).
    /// Cannot be used simultaneously with volume.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub target_lufs: Option<f64>,
    /// Pitch adjustment in semitones (-12 to +12, default: 0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_pitch: Option<i32>,
    /// Speech speed multiplier (0.5 to 2.0, default: 1.0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_tempo: Option<f64>,
    /// Output audio format (wav or mp3, default: wav)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_format: Option<AudioFormat>,
}
133
134impl Output {
135    /// Create a new Output with default values
136    pub fn new() -> Self {
137        Self::default()
138    }
139
140    /// Set the volume (0-200).
141    /// Cannot be used simultaneously with target_lufs.
142    pub fn volume(mut self, volume: i32) -> Self {
143        self.volume = Some(volume.clamp(0, 200));
144        self
145    }
146
147    /// Set the target LUFS (-70 to 0)
148    pub fn target_lufs(mut self, lufs: f64) -> Self {
149        self.target_lufs = Some(lufs.clamp(-70.0, 0.0));
150        self
151    }
152
153    /// Set the audio pitch (-12 to +12 semitones)
154    pub fn audio_pitch(mut self, pitch: i32) -> Self {
155        self.audio_pitch = Some(pitch.clamp(-12, 12));
156        self
157    }
158
159    /// Set the audio tempo (0.5 to 2.0)
160    pub fn audio_tempo(mut self, tempo: f64) -> Self {
161        self.audio_tempo = Some(tempo.clamp(0.5, 2.0));
162        self
163    }
164
165    /// Set the audio format
166    pub fn audio_format(mut self, format: AudioFormat) -> Self {
167        self.audio_format = Some(format);
168        self
169    }
170}
171
/// Audio output settings for streaming TTS requests.
///
/// Identical to [`Output`] but without `volume` or `target_lufs`, which are
/// not supported by the streaming endpoint. Unset (`None`) fields are
/// omitted from the serialized request.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct OutputStream {
    /// Pitch adjustment in semitones (-12 to +12, default: 0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_pitch: Option<i32>,
    /// Speech speed multiplier (0.5 to 2.0, default: 1.0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_tempo: Option<f64>,
    /// Output audio format (wav or mp3, default: wav)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_format: Option<AudioFormat>,
}
188
189impl OutputStream {
190    /// Create a new OutputStream with default values
191    pub fn new() -> Self {
192        Self::default()
193    }
194
195    /// Set the audio pitch (-12 to +12 semitones)
196    pub fn audio_pitch(mut self, pitch: i32) -> Self {
197        self.audio_pitch = Some(pitch.clamp(-12, 12));
198        self
199    }
200
201    /// Set the audio tempo (0.5 to 2.0)
202    pub fn audio_tempo(mut self, tempo: f64) -> Self {
203        self.audio_tempo = Some(tempo.clamp(0.5, 2.0));
204        self
205    }
206
207    /// Set the audio format
208    pub fn audio_format(mut self, format: AudioFormat) -> Self {
209        self.audio_format = Some(format);
210        self
211    }
212}
213
/// Emotion settings for the ssfm-v21 model.
///
/// Both fields are optional; unset (`None`) fields are omitted from the
/// serialized request.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Prompt {
    /// Emotion preset to apply
    #[serde(skip_serializing_if = "Option::is_none")]
    pub emotion_preset: Option<EmotionPreset>,
    /// Emotion intensity (0.0 to 2.0, default: 1.0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub emotion_intensity: Option<f64>,
}
224
225impl Prompt {
226    /// Create a new Prompt with default values
227    pub fn new() -> Self {
228        Self::default()
229    }
230
231    /// Set the emotion preset
232    pub fn emotion_preset(mut self, preset: EmotionPreset) -> Self {
233        self.emotion_preset = Some(preset);
234        self
235    }
236
237    /// Set the emotion intensity (0.0 to 2.0)
238    pub fn emotion_intensity(mut self, intensity: f64) -> Self {
239        self.emotion_intensity = Some(intensity.clamp(0.0, 2.0));
240        self
241    }
242}
243
/// Preset-based emotion control for the ssfm-v30 model.
///
/// `emotion_type` acts as the discriminator the API uses to distinguish
/// this from [`SmartPrompt`]; [`PresetPrompt::default`] sets it to
/// `"preset"` — avoid overwriting it with anything else.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PresetPrompt {
    /// Must be "preset" for preset-based emotion control
    pub emotion_type: String,
    /// Emotion preset to apply
    #[serde(skip_serializing_if = "Option::is_none")]
    pub emotion_preset: Option<EmotionPreset>,
    /// Emotion intensity (0.0 to 2.0, default: 1.0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub emotion_intensity: Option<f64>,
}
256
257impl Default for PresetPrompt {
258    fn default() -> Self {
259        Self {
260            emotion_type: "preset".to_string(),
261            emotion_preset: None,
262            emotion_intensity: None,
263        }
264    }
265}
266
267impl PresetPrompt {
268    /// Create a new PresetPrompt
269    pub fn new() -> Self {
270        Self::default()
271    }
272
273    /// Set the emotion preset
274    pub fn emotion_preset(mut self, preset: EmotionPreset) -> Self {
275        self.emotion_preset = Some(preset);
276        self
277    }
278
279    /// Set the emotion intensity (0.0 to 2.0)
280    pub fn emotion_intensity(mut self, intensity: f64) -> Self {
281        self.emotion_intensity = Some(intensity.clamp(0.0, 2.0));
282        self
283    }
284}
285
/// Context-aware emotion inference for the ssfm-v30 model.
///
/// `emotion_type` acts as the discriminator the API uses to distinguish
/// this from [`PresetPrompt`]; [`SmartPrompt::default`] sets it to
/// `"smart"` — avoid overwriting it with anything else.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SmartPrompt {
    /// Must be "smart" for context-aware emotion inference
    pub emotion_type: String,
    /// Text that comes before the main text (max 2000 chars)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub previous_text: Option<String>,
    /// Text that comes after the main text (max 2000 chars)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub next_text: Option<String>,
}
298
299impl Default for SmartPrompt {
300    fn default() -> Self {
301        Self {
302            emotion_type: "smart".to_string(),
303            previous_text: None,
304            next_text: None,
305        }
306    }
307}
308
309impl SmartPrompt {
310    /// Create a new SmartPrompt
311    pub fn new() -> Self {
312        Self::default()
313    }
314
315    /// Set the previous text for context
316    pub fn previous_text(mut self, text: impl Into<String>) -> Self {
317        self.previous_text = Some(text.into());
318        self
319    }
320
321    /// Set the next text for context
322    pub fn next_text(mut self, text: impl Into<String>) -> Self {
323        self.next_text = Some(text.into());
324        self
325    }
326}
327
/// Union type for all prompt types.
///
/// `#[serde(untagged)]`: serialization writes the inner value directly
/// (no wrapper key), and deserialization tries the variants in declaration
/// order, taking the first that matches.
///
/// NOTE(review): because every field of [`Prompt`] is optional and serde
/// ignores unknown fields by default, the `Basic` variant (tried first)
/// will match nearly any JSON object — including `PresetPrompt`/`SmartPrompt`
/// payloads, silently dropping their `emotion_type`. This only matters if
/// the enum is ever deserialized from external data; confirm whether that
/// path exists before relying on round-tripping.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum TTSPrompt {
    /// Basic emotion control (ssfm-v21 compatible)
    Basic(Prompt),
    /// Explicit preset emotion control (ssfm-v30)
    Preset(PresetPrompt),
    /// Context-aware emotion inference (ssfm-v30)
    Smart(SmartPrompt),
}
339
340impl From<Prompt> for TTSPrompt {
341    fn from(prompt: Prompt) -> Self {
342        TTSPrompt::Basic(prompt)
343    }
344}
345
346impl From<PresetPrompt> for TTSPrompt {
347    fn from(prompt: PresetPrompt) -> Self {
348        TTSPrompt::Preset(prompt)
349    }
350}
351
352impl From<SmartPrompt> for TTSPrompt {
353    fn from(prompt: SmartPrompt) -> Self {
354        TTSPrompt::Smart(prompt)
355    }
356}
357
/// Text-to-Speech request parameters.
///
/// Build with [`TTSRequest::new`] and chain the builder methods for the
/// optional fields; unset (`None`) fields are omitted from serialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TTSRequest {
    /// Voice ID in format 'tc_' followed by a unique identifier
    pub voice_id: String,
    /// Text to convert to speech (max 2000 chars)
    pub text: String,
    /// TTS model to use
    pub model: TTSModel,
    /// Language code (ISO 639-3). Auto-detected if not provided
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// Emotion and style settings
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<TTSPrompt>,
    /// Audio output settings
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output: Option<Output>,
    /// Random seed for reproducible results
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
}
380
381impl TTSRequest {
382    /// Create a new TTSRequest with required fields
383    pub fn new(voice_id: impl Into<String>, text: impl Into<String>, model: TTSModel) -> Self {
384        Self {
385            voice_id: voice_id.into(),
386            text: text.into(),
387            model,
388            language: None,
389            prompt: None,
390            output: None,
391            seed: None,
392        }
393    }
394
395    /// Set the language code (ISO 639-3)
396    pub fn language(mut self, language: impl Into<String>) -> Self {
397        self.language = Some(language.into());
398        self
399    }
400
401    /// Set the prompt (emotion settings)
402    pub fn prompt(mut self, prompt: impl Into<TTSPrompt>) -> Self {
403        self.prompt = Some(prompt.into());
404        self
405    }
406
407    /// Set the output settings
408    pub fn output(mut self, output: Output) -> Self {
409        self.output = Some(output);
410        self
411    }
412
413    /// Set the random seed for reproducible results
414    pub fn seed(mut self, seed: i32) -> Self {
415        self.seed = Some(seed);
416        self
417    }
418}
419
/// Streaming Text-to-Speech request parameters.
///
/// Mirrors [`TTSRequest`] but the `output` field uses [`OutputStream`], which
/// excludes `volume` and `target_lufs` (not supported by the streaming endpoint).
/// Unset (`None`) fields are omitted from serialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TTSRequestStream {
    /// Voice ID in format 'tc_' followed by a unique identifier
    pub voice_id: String,
    /// Text to convert to speech (max 2000 chars)
    pub text: String,
    /// TTS model to use
    pub model: TTSModel,
    /// Language code (ISO 639-3). Auto-detected if not provided
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// Emotion and style settings
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<TTSPrompt>,
    /// Audio output settings (without volume/target_lufs)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output: Option<OutputStream>,
    /// Random seed for reproducible results
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
}
445
446impl TTSRequestStream {
447    /// Create a new TTSRequestStream with required fields
448    pub fn new(voice_id: impl Into<String>, text: impl Into<String>, model: TTSModel) -> Self {
449        Self {
450            voice_id: voice_id.into(),
451            text: text.into(),
452            model,
453            language: None,
454            prompt: None,
455            output: None,
456            seed: None,
457        }
458    }
459
460    /// Set the language code (ISO 639-3)
461    pub fn language(mut self, language: impl Into<String>) -> Self {
462        self.language = Some(language.into());
463        self
464    }
465
466    /// Set the prompt (emotion settings)
467    pub fn prompt(mut self, prompt: impl Into<TTSPrompt>) -> Self {
468        self.prompt = Some(prompt.into());
469        self
470    }
471
472    /// Set the output settings
473    pub fn output(mut self, output: OutputStream) -> Self {
474        self.output = Some(output);
475        self
476    }
477
478    /// Set the random seed for reproducible results
479    pub fn seed(mut self, seed: i32) -> Self {
480        self.seed = Some(seed);
481        self
482    }
483}
484
/// Text-to-Speech response.
///
/// Not a serde type: presumably assembled by the client from the raw HTTP
/// response (audio bytes plus metadata) rather than deserialized from JSON —
/// confirm against the client code.
#[derive(Debug, Clone)]
pub struct TTSResponse {
    /// Generated audio data (raw bytes in `format`)
    pub audio_data: Vec<u8>,
    /// Audio duration in seconds
    pub duration: f64,
    /// Audio format (wav or mp3)
    pub format: AudioFormat,
}
495
/// Model information with supported emotions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelInfo {
    /// TTS model version
    pub version: TTSModel,
    /// List of supported emotions for this model, as free-form strings
    /// reported by the API (not the [`EmotionPreset`] enum)
    pub emotions: Vec<String>,
}
504
/// Voice from V2 API with enhanced metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VoiceV2 {
    /// Unique voice identifier
    pub voice_id: String,
    /// Human-readable name of the voice
    pub voice_name: String,
    /// List of supported TTS models with their emotions
    pub models: Vec<ModelInfo>,
    /// Voice gender classification
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gender: Option<Gender>,
    /// Voice age group classification
    #[serde(skip_serializing_if = "Option::is_none")]
    pub age: Option<Age>,
    /// List of use case categories, kept as free-form strings rather than
    /// the [`UseCase`] enum — presumably so unknown server-side categories
    /// don't break deserialization; confirm before tightening the type
    #[serde(skip_serializing_if = "Option::is_none")]
    pub use_cases: Option<Vec<String>>,
}
524
/// Filter options for the V2 voices endpoint.
///
/// Each criterion is optional; `None` means "don't filter on this axis".
/// Not a serde type — presumably converted to query parameters by the
/// client; confirm against the endpoint code.
#[derive(Debug, Clone, Default)]
pub struct VoicesV2Filter {
    /// Filter by TTS model
    pub model: Option<TTSModel>,
    /// Filter by gender
    pub gender: Option<Gender>,
    /// Filter by age group
    pub age: Option<Age>,
    /// Filter by use case. Despite the plural name, this holds a single
    /// [`UseCase`] (the name mirrors the API's `use_cases` parameter).
    pub use_cases: Option<UseCase>,
}
537
538impl VoicesV2Filter {
539    /// Create a new empty filter
540    pub fn new() -> Self {
541        Self::default()
542    }
543
544    /// Filter by model
545    pub fn model(mut self, model: TTSModel) -> Self {
546        self.model = Some(model);
547        self
548    }
549
550    /// Filter by gender
551    pub fn gender(mut self, gender: Gender) -> Self {
552        self.gender = Some(gender);
553        self
554    }
555
556    /// Filter by age
557    pub fn age(mut self, age: Age) -> Self {
558        self.age = Some(age);
559        self
560    }
561
562    /// Filter by use case
563    pub fn use_cases(mut self, use_case: UseCase) -> Self {
564        self.use_cases = Some(use_case);
565        self
566    }
567}
568
/// Subscription plan tier.
///
/// Serialized in lowercase (`"free"`, `"lite"`, `"plus"`, `"custom"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum PlanTier {
    /// Free plan
    Free,
    /// Lite paid plan
    Lite,
    /// Plus paid plan
    Plus,
    /// Custom enterprise plan
    Custom,
}
582
/// Credit usage information.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Credits {
    /// Total credits provided by the plan
    pub plan_credits: i64,
    /// Number of credits used
    pub used_credits: i64,
}
591
/// Usage limit information.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Limits {
    /// Maximum number of concurrent requests allowed
    pub concurrency_limit: i64,
}
598
/// Response from `GET /v1/users/me/subscription`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SubscriptionResponse {
    /// Current subscription plan tier
    pub plan: PlanTier,
    /// Credit usage information
    pub credits: Credits,
    /// Usage limit information
    pub limits: Limits,
}
609
/// API error response body.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorResponse {
    /// Error message describing the issue
    pub detail: String,
}