Skip to main content

any_tts/models/
mod.rs

1//! Model backends for TTS synthesis.
2
3#[cfg(feature = "kokoro")]
4pub mod kokoro;
5
6#[cfg(feature = "omnivoice")]
7pub mod omnivoice;
8
9#[cfg(feature = "qwen3-tts")]
10pub mod qwen3_tts;
11
12#[cfg(feature = "vibevoice")]
13pub mod vibevoice;
14
15#[cfg(feature = "vibevoice")]
16pub mod vibevoice_realtime;
17
18#[cfg(feature = "voxtral")]
19pub mod voxtral;
20
/// A documented model asset requirement or optional asset pattern.
///
/// One entry describes a single file (or family of files) that a backend's
/// published model bundle contains. The per-backend tables in this module are
/// slices of these entries and are exposed through
/// [`ModelType::asset_requirements`].
///
/// The type is plain static data: it is `Copy`, comparable, and hashable, so
/// entries can be deduplicated or used as map keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct ModelAssetRequirement {
    /// File name or pattern naming the asset.
    ///
    /// The tables in this module use `*` as a wildcard (for example
    /// `voices/*.pt`) and ` | ` to separate acceptable alternatives (for
    /// example `model.safetensors | *.pth`). This type only stores the text;
    /// it does not itself match paths.
    pub pattern: &'static str,
    /// `true` when the asset is documented as mandatory, `false` when it is
    /// optional.
    pub required: bool,
    /// Short human-readable description of what the asset is used for.
    pub purpose: &'static str,
}
28
29const KOKORO_ASSETS: &[ModelAssetRequirement] = &[
30    ModelAssetRequirement {
31        pattern: "config.json",
32        required: true,
33        purpose: "Model architecture and phoneme vocabulary.",
34    },
35    ModelAssetRequirement {
36        pattern: "model.safetensors | *.pth",
37        required: true,
38        purpose: "Main Kokoro weights.",
39    },
40    ModelAssetRequirement {
41        pattern: "voices/*.pt",
42        required: false,
43        purpose: "Preset voice packs for named-voice synthesis.",
44    },
45];
46
47const OMNIVOICE_ASSETS: &[ModelAssetRequirement] = &[
48    ModelAssetRequirement {
49        pattern: "config.json",
50        required: true,
51        purpose: "Main OmniVoice config.",
52    },
53    ModelAssetRequirement {
54        pattern: "tokenizer.json",
55        required: true,
56        purpose: "Text tokenizer.",
57    },
58    ModelAssetRequirement {
59        pattern: "model.safetensors | model-*-of-*.safetensors",
60        required: true,
61        purpose: "Main OmniVoice weights.",
62    },
63    ModelAssetRequirement {
64        pattern: "audio_tokenizer/config.json",
65        required: true,
66        purpose: "Codec decoder config.",
67    },
68    ModelAssetRequirement {
69        pattern: "audio_tokenizer/model.safetensors | audio_tokenizer/model-*-of-*.safetensors",
70        required: true,
71        purpose: "Codec decoder weights.",
72    },
73];
74
75const QWEN3_TTS_ASSETS: &[ModelAssetRequirement] = &[
76    ModelAssetRequirement {
77        pattern: "config.json",
78        required: true,
79        purpose: "Main talker/code-predictor config.",
80    },
81    ModelAssetRequirement {
82        pattern: "tokenizer.json",
83        required: true,
84        purpose: "Text tokenizer.",
85    },
86    ModelAssetRequirement {
87        pattern: "model.safetensors | model-*-of-*.safetensors",
88        required: true,
89        purpose: "Main Qwen3-TTS weights.",
90    },
91    ModelAssetRequirement {
92        pattern: "speech_tokenizer/model.safetensors | speech_tokenizer/model-*-of-*.safetensors",
93        required: true,
94        purpose: "Speech-tokenizer decoder weights.",
95    },
96    ModelAssetRequirement {
97        pattern: "speech_tokenizer/config.json",
98        required: false,
99        purpose: "Optional speech-tokenizer config when it is stored beside the main assets.",
100    },
101];
102
103const VIBEVOICE_ASSETS: &[ModelAssetRequirement] = &[
104    ModelAssetRequirement {
105        pattern: "config.json",
106        required: true,
107        purpose: "Main VibeVoice config.",
108    },
109    ModelAssetRequirement {
110        pattern: "tokenizer.json",
111        required: true,
112        purpose: "Text tokenizer.",
113    },
114    ModelAssetRequirement {
115        pattern: "model.safetensors | model-*-of-*.safetensors",
116        required: true,
117        purpose: "Unified VibeVoice weights.",
118    },
119    ModelAssetRequirement {
120        pattern: "preprocessor_config.json",
121        required: false,
122        purpose: "Published preprocessing defaults.",
123    },
124];
125
126const VIBEVOICE_REALTIME_ASSETS: &[ModelAssetRequirement] = &[
127    ModelAssetRequirement {
128        pattern: "config.json",
129        required: true,
130        purpose: "Main VibeVoice Realtime config.",
131    },
132    ModelAssetRequirement {
133        pattern: "tokenizer.json",
134        required: true,
135        purpose: "Text tokenizer.",
136    },
137    ModelAssetRequirement {
138        pattern: "model.safetensors",
139        required: true,
140        purpose: "Realtime VibeVoice weights.",
141    },
142    ModelAssetRequirement {
143        pattern: "preprocessor_config.json",
144        required: false,
145        purpose: "Published preprocessing defaults.",
146    },
147    ModelAssetRequirement {
148        pattern: "voices/*.pt",
149        required: false,
150        purpose: "Optional cached-prompt voice presets from the upstream demo bundle.",
151    },
152];
153
154const VOXTRAL_ASSETS: &[ModelAssetRequirement] = &[
155    ModelAssetRequirement {
156        pattern: "params.json",
157        required: true,
158        purpose: "Main Voxtral config.",
159    },
160    ModelAssetRequirement {
161        pattern: "tekken.json",
162        required: true,
163        purpose: "Tekken tokenizer.",
164    },
165    ModelAssetRequirement {
166        pattern: "consolidated.safetensors",
167        required: true,
168        purpose: "Main Voxtral weights.",
169    },
170    ModelAssetRequirement {
171        pattern: "voice_embedding/*.pt",
172        required: true,
173        purpose: "Preset voice embeddings.",
174    },
175];
176
/// Supported model types.
///
/// Each variant names one TTS backend. The backend modules in this file are
/// compiled only when their Cargo feature is enabled, but the variants
/// themselves are not feature-gated, so every variant can always be named,
/// compared, and used as a hash-map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ModelType {
    /// Kokoro-82M: 82M parameter StyleTTS2 model with ISTFTNet decoder.
    Kokoro,
    /// OmniVoice: native Candle implementation for omnilingual zero-shot TTS.
    OmniVoice,
    /// Qwen3-TTS-12Hz-1.7B-CustomVoice: 1.7B multi-codebook LM.
    Qwen3Tts,
    /// VibeVoice-1.5B: native Candle implementation with diffusion speech tokens.
    VibeVoice,
    /// VibeVoice-Realtime-0.5B: native Candle implementation with cached prompt presets.
    VibeVoiceRealtime,
    /// Voxtral-4B-TTS-2603: native Candle implementation.
    Voxtral,
}
193
194impl ModelType {
195    /// Return the documented asset layout for this backend.
196    pub fn asset_requirements(self) -> &'static [ModelAssetRequirement] {
197        match self {
198            Self::Kokoro => KOKORO_ASSETS,
199            Self::OmniVoice => OMNIVOICE_ASSETS,
200            Self::Qwen3Tts => QWEN3_TTS_ASSETS,
201            Self::VibeVoice => VIBEVOICE_ASSETS,
202            Self::VibeVoiceRealtime => VIBEVOICE_REALTIME_ASSETS,
203            Self::Voxtral => VOXTRAL_ASSETS,
204        }
205    }
206}