any_tts/lib.rs
1//! # any-tts
2//!
3//! A Rust text-to-speech library powered primarily by the
4//! [candle](https://github.com/huggingface/candle) ML framework.
5//! Provides a unified trait-based API with pluggable model backends, including
6//! native Candle implementations and adapters for official upstream runtimes.
7//!
8//! ## Supported Models
9//!
10//! - **Kokoro-82M** — 82M parameter StyleTTS2 model with ISTFTNet decoder for fast, high-quality speech
11//! - **OmniVoice** — native Candle implementation of the OmniVoice zero-shot TTS model
12//! - **Qwen3-TTS-12Hz-1.7B-CustomVoice** — 1.7B parameter multi-codebook LM for 10 languages
13//! - **Qwen3-TTS-12Hz-1.7B-VoiceDesign** — 1.7B model with natural language voice descriptions
14//! - **VibeVoice-1.5B** — native Candle implementation of Microsoft's multi-speaker speech diffusion model
15//! - **VibeVoice-Realtime-0.5B** — native Candle implementation of Microsoft's cached-prompt realtime TTS model
16//! - **Voxtral-4B-TTS-2603** — native Candle implementation of Mistral's 4B TTS model
17//!
18//! ## Feature Flags
19//!
20//! - `cuda` — Enable CUDA GPU acceleration
21//! - `metal` — Enable Metal GPU acceleration (macOS/iOS)
22//! - `accelerate` — Enable Apple Accelerate framework
23//! - `kokoro` — Build Kokoro model support (default)
24//! - `omnivoice` — Build native OmniVoice support (default)
25//! - `qwen3-tts` — Build Qwen3-TTS model support (default)
26//! - `vibevoice` — Build native VibeVoice support, covering both VibeVoice-1.5B and VibeVoice-Realtime-0.5B (default)
27//! - `voxtral` — Build native Voxtral support (default)
28//! - `download` — Enable automatic model downloading from HuggingFace Hub (default)
29//!
30//! ## Quick Start
31//!
32//! ```rust,no_run
33//! use any_tts::{TtsModel, TtsConfig, SynthesisRequest, ModelType};
34//!
35//! // Load a model
36//! let config = TtsConfig::new(ModelType::Qwen3Tts)
37//! .with_model_path("/path/to/model");
38//! let model = any_tts::load_model(config).unwrap();
39//!
40//! // Synthesize speech
41//! let request = SynthesisRequest::new("Hello, world!")
42//! .with_language("en");
43//! let audio = model.synthesize(&request).unwrap();
44//!
45//! // audio.samples contains f32 PCM data at model.sample_rate() Hz
46//! let wav_bytes = audio.get_wav();
47//! let _ = wav_bytes;
48//! ```
49
50pub mod audio;
51pub mod config;
52pub mod device;
53pub mod error;
54pub mod layers;
55pub mod mel;
56pub mod models;
57pub mod tensor_utils;
58pub mod tokenizer;
59pub mod traits;
60
61#[cfg(feature = "download")]
62pub mod download;
63
64// Re-export primary API types
65pub use audio::{AudioSamples, DenoiseOptions};
66pub use config::{
67 preferred_runtime_choice, preferred_runtime_choices, DType, ModelAsset, ModelAssetBundle,
68 ModelAssetDir, ModelFiles, RuntimeChoice, TtsConfig,
69};
70pub use device::DeviceSelection;
71pub use error::TtsError;
72pub use mel::{MelConfig, MelSpectrogram};
73pub use models::{ModelAssetRequirement, ModelType};
74pub use traits::{
75 ModelInfo, ReferenceAudio, SynthesisRequest, TtsModel, VoiceCloning, VoiceEmbedding,
76};
77
78/// Load a TTS model based on the provided configuration.
79///
80/// This is the main entry point for creating a model instance. It dispatches
81/// to the appropriate model backend based on `config.model_type`.
82pub fn load_model(config: TtsConfig) -> Result<Box<dyn TtsModel>, TtsError> {
83 match config.model_type {
84 #[cfg(feature = "kokoro")]
85 ModelType::Kokoro => {
86 let model = models::kokoro::KokoroModel::load(config)?;
87 Ok(Box::new(model))
88 }
89 #[cfg(feature = "omnivoice")]
90 ModelType::OmniVoice => {
91 let model = models::omnivoice::OmniVoiceModel::load(config)?;
92 Ok(Box::new(model))
93 }
94 #[cfg(feature = "qwen3-tts")]
95 ModelType::Qwen3Tts => {
96 let model = models::qwen3_tts::Qwen3TtsModel::load(config)?;
97 Ok(Box::new(model))
98 }
99 #[cfg(feature = "vibevoice")]
100 ModelType::VibeVoice => {
101 let model = models::vibevoice::VibeVoiceModel::load(config)?;
102 Ok(Box::new(model))
103 }
104 #[cfg(feature = "vibevoice")]
105 ModelType::VibeVoiceRealtime => {
106 let model = models::vibevoice_realtime::VibeVoiceRealtimeModel::load(config)?;
107 Ok(Box::new(model))
108 }
109 #[cfg(feature = "voxtral")]
110 ModelType::Voxtral => {
111 let model = models::voxtral::VoxtralModel::load(config)?;
112 Ok(Box::new(model))
113 }
114 #[allow(unreachable_patterns)]
115 _ => Err(TtsError::UnsupportedModel(format!(
116 "Model type {:?} is not enabled. Enable the corresponding feature flag.",
117 config.model_type
118 ))),
119 }
120}