Skip to main content

any_tts/
lib.rs

1//! # any-tts
2//!
3//! A Rust text-to-speech library powered primarily by the
4//! [candle](https://github.com/huggingface/candle) ML framework.
5//! Provides a unified trait-based API with pluggable model backends, including
6//! native Candle implementations and adapters for official upstream runtimes.
7//!
8//! ## Supported Models
9//!
10//! - **Kokoro-82M** — 82M parameter StyleTTS2 model with ISTFTNet decoder for fast, high-quality speech
11//! - **OmniVoice** — native Candle implementation of the OmniVoice zero-shot TTS model
12//! - **Qwen3-TTS-12Hz-1.7B-CustomVoice** — 1.7B parameter multi-codebook LM for 10 languages
13//! - **Qwen3-TTS-12Hz-1.7B-VoiceDesign** — 1.7B model with natural language voice descriptions
14//! - **VibeVoice-1.5B** — native Candle implementation of Microsoft's multi-speaker speech diffusion model
15//! - **VibeVoice-Realtime-0.5B** — native Candle implementation of Microsoft's cached-prompt realtime TTS model
16//! - **Voxtral-4B-TTS-2603** — native Candle implementation of Mistral's 4B TTS model
17//!
18//! ## Feature Flags
19//!
20//! - `cuda` — Enable CUDA GPU acceleration
21//! - `metal` — Enable Metal GPU acceleration (macOS/iOS)
22//! - `accelerate` — Enable Apple Accelerate framework
23//! - `kokoro` — Build Kokoro model support (default)
24//! - `omnivoice` — Build native OmniVoice support (default)
25//! - `qwen3-tts` — Build Qwen3-TTS model support (default)
26//! - `vibevoice` — Build native VibeVoice support (default)
27//! - `voxtral` — Build native Voxtral support (default)
28//! - `download` — Enable automatic model downloading from HuggingFace Hub (default)
29//!
30//! ## Quick Start
31//!
32//! ```rust,no_run
33//! use any_tts::{TtsModel, TtsConfig, SynthesisRequest, ModelType};
34//!
35//! // Load a model
36//! let config = TtsConfig::new(ModelType::Qwen3Tts)
37//!     .with_model_path("/path/to/model");
38//! let model = any_tts::load_model(config).unwrap();
39//!
40//! // Synthesize speech
41//! let request = SynthesisRequest::new("Hello, world!")
42//!     .with_language("en");
43//! let audio = model.synthesize(&request).unwrap();
44//!
45//! // audio.samples contains f32 PCM data at model.sample_rate() Hz
46//! let wav_bytes = audio.get_wav();
47//! let _ = wav_bytes;
48//! ```
49
50pub mod audio;
51pub mod config;
52pub mod device;
53pub mod error;
54pub mod layers;
55pub mod mel;
56pub mod models;
57pub mod tensor_utils;
58pub mod tokenizer;
59pub mod traits;
60
61#[cfg(feature = "download")]
62pub mod download;
63
64// Re-export primary API types
65pub use audio::{AudioSamples, DenoiseOptions};
66pub use config::{
67    preferred_runtime_choice, preferred_runtime_choices, DType, ModelAsset, ModelAssetBundle,
68    ModelAssetDir, ModelFiles, RuntimeChoice, TtsConfig,
69};
70pub use device::DeviceSelection;
71pub use error::TtsError;
72pub use mel::{MelConfig, MelSpectrogram};
73pub use models::{ModelAssetRequirement, ModelType};
74pub use traits::{
75    ModelInfo, ReferenceAudio, SynthesisRequest, TtsModel, VoiceCloning, VoiceEmbedding,
76};
77
78/// Load a TTS model based on the provided configuration.
79///
80/// This is the main entry point for creating a model instance. It dispatches
81/// to the appropriate model backend based on `config.model_type`.
82pub fn load_model(config: TtsConfig) -> Result<Box<dyn TtsModel>, TtsError> {
83    match config.model_type {
84        #[cfg(feature = "kokoro")]
85        ModelType::Kokoro => {
86            let model = models::kokoro::KokoroModel::load(config)?;
87            Ok(Box::new(model))
88        }
89        #[cfg(feature = "omnivoice")]
90        ModelType::OmniVoice => {
91            let model = models::omnivoice::OmniVoiceModel::load(config)?;
92            Ok(Box::new(model))
93        }
94        #[cfg(feature = "qwen3-tts")]
95        ModelType::Qwen3Tts => {
96            let model = models::qwen3_tts::Qwen3TtsModel::load(config)?;
97            Ok(Box::new(model))
98        }
99        #[cfg(feature = "vibevoice")]
100        ModelType::VibeVoice => {
101            let model = models::vibevoice::VibeVoiceModel::load(config)?;
102            Ok(Box::new(model))
103        }
104        #[cfg(feature = "vibevoice")]
105        ModelType::VibeVoiceRealtime => {
106            let model = models::vibevoice_realtime::VibeVoiceRealtimeModel::load(config)?;
107            Ok(Box::new(model))
108        }
109        #[cfg(feature = "voxtral")]
110        ModelType::Voxtral => {
111            let model = models::voxtral::VoxtralModel::load(config)?;
112            Ok(Box::new(model))
113        }
114        #[allow(unreachable_patterns)]
115        _ => Err(TtsError::UnsupportedModel(format!(
116            "Model type {:?} is not enabled. Enable the corresponding feature flag.",
117            config.model_type
118        ))),
119    }
120}