1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
//! # neutts
//!
//! Rust port of [NeuTTS](https://github.com/neuphonic/neutts) —
//! an on-device voice-cloning TTS system built on a GGUF LLM backbone and
//! the NeuCodec neural audio codec (pure-Rust CPU inference, no ONNX Runtime).
//!
//! ## Architecture
//!
//! ```text
//! text ──► espeak-ng ──► IPA ──► GGUF backbone ──► speech tokens ──► NeuCodec decoder ──► audio
//!                + ref codes ──►
//! ```
//!
//! 1. **GGUF backbone** (`llama-cpp-4`) — a small causal LM that generates speech token IDs.
//! 2. **NeuCodec decoder** — pure-Rust FSQ+Vocos+ISTFT decoder; 24 kHz output.
//!
//! ## One-time setup
//!
//! ```sh
//! pip install torch huggingface_hub safetensors
//! python scripts/convert_weights.py # download + extract decoder weights
//! cargo build # codec weights loaded at runtime
//! ```
//!
//! ## Quick start
//!
//! ```ignore
//! use neutts::{NeuTTS, download};
//! use std::path::Path;
//!
//! let tts = download::load_from_hub("neuphonic/neutts-nano-q4-gguf").unwrap();
//! let ref_codes = tts.load_ref_codes(Path::new("samples/jo.npy")).unwrap();
//! let audio = tts.infer("Hello from Rust!", &ref_codes, "Reference transcript.").unwrap();
//! tts.write_wav(&audio, Path::new("output.wav")).unwrap();
//! ```
//!
//! ## Features
//!
//! | Feature | Default | Effect |
//! |------------|---------|------------------------------------------------------------------------------|
//! | `backbone` | ✓ | GGUF backbone via llama-cpp-4 (requires cmake + C++) |
//! | `espeak` | | Raw-text input via pure-Rust espeak-ng (114 bundled languages, no system deps) |
//! | `wgpu` | | GPU-accelerated codec via Burn wgpu; falls back to Burn NdArray then ndarray |
//! | `metal` | | macOS Metal GPU for the backbone (passed to llama-cpp-4) |
//! | `cuda` | | NVIDIA CUDA for the backbone (passed to llama-cpp-4) |
//! | `fast` | ✓ | RoPE: degree-7/6 Horner polynomial, no transcendental calls (~1e-4 error) |
//! | `precise` | | RoPE: stdlib `f32::sin_cos()`, platform libm accuracy; mutually exclusive with `fast` |
// HuggingFace Hub download — desktop only (hf-hub needs OpenSSL).
/// Burn wgpu/NdArray backend for the NeuCodec decoder.
/// Only compiled when the `wgpu` Cargo feature is enabled.
pub
// ─── Re-exports ───────────────────────────────────────────────────────────────
/// The main TTS handle.
pub use NeuTTS;
/// Disk cache for pre-encoded reference codes, keyed by SHA-256 of the WAV.
pub use RefCodeCache;
/// Result of a [`RefCodeCache`] lookup.
pub use CacheOutcome;
/// NeuCodec encoder stub (encoder not yet implemented in pure-Rust build).
pub use NeuCodecEncoder;
/// NeuCodec decoder — converts speech token IDs to 24 kHz audio.
pub use NeuCodecDecoder;
/// Decoder output sample rate (24 000 Hz).
pub use SAMPLE_RATE;
/// Encoder input sample rate (16 000 Hz).
pub use ENCODER_SAMPLE_RATE;
/// Decoder: audio samples per token (hop_length, nominally 480 = 24 000 / 50).
pub use SAMPLES_PER_TOKEN;
/// Encoder: audio samples consumed per token (320 = 16 000 / 50).
pub use ENCODER_SAMPLES_PER_TOKEN;