Skip to main content

encode_reference/
encode_reference.rs

1//! Encode a reference WAV file to NeuCodec token IDs and save as `.npy`.
2//!
3//! This is the Rust equivalent of the Python pre-processing step:
4//!
5//! ```python
6//! from neutts import NeuTTS
7//! import numpy as np
8//! tts   = NeuTTS(codec_repo="neuphonic/neucodec")
9//! codes = tts.encode_reference("reference.wav")
10//! np.save("samples/my_voice.npy", codes.numpy().astype("int32"))
11//! ```
12//!
13//! ## One-time setup
14//!
15//! The NeuCodec encoder must be compiled into the binary first:
16//!
17//! ```sh
18//! cargo run --example download_models   # fetches ONNX → models/
19//! cargo build                           # burn-import converts + embeds weights
20//! ```
21//!
22//! ## Backend selection
23//!
24//! The **Wgpu** (GPU) backend is tried first; if the GPU stack is unavailable
25//! it falls back to **NdArray** (pure-Rust CPU) automatically.  The selected
26//! backend is printed after the encoder is initialised.
27//!
28//! Force CPU-only:
29//!
30//! ```sh
31//! cargo run --example encode_reference --no-default-features -- --audio reference.wav
32//! ```
33//!
34//! ## Usage
35//!
36//! ```sh
37//! # Encode a WAV (embedded weights, auto backend)
38//! cargo run --example encode_reference -- --audio reference.wav
39//!
40//! # Explicit output path
41//! cargo run --example encode_reference -- \
42//!   --audio reference.wav \
43//!   --out   samples/my_voice.npy
44//!
45//! # Load encoder weights from an external BurnPack file (.bpk)
46//! cargo run --example encode_reference -- \
47//!   --audio   reference.wav \
48//!   --encoder /path/to/neucodec_encoder.bpk \
49//!   --out     samples/my_voice.npy
50//! ```
51//!
52//! ## Notes
53//!
54//! - The WAV is resampled to 16 kHz mono automatically — any sample rate /
55//!   channel count / bit depth is accepted.
56//! - The `.npy` file can be passed to `basic` or `clone_voice` via
57//!   `--ref-codes` and is compatible with `np.load()` in Python.
58//! - Aim for 5–30 seconds of clean, noise-free speech for best results.
59
60use std::path::PathBuf;
61
62fn main() -> anyhow::Result<()> {
63    // ── Parse CLI arguments ───────────────────────────────────────────────────
64    let mut args = std::env::args().skip(1).peekable();
65
66    let mut encoder_bpk: Option<PathBuf> = None; // optional external .bpk weight file
67    let mut audio_path:  Option<PathBuf> = None;
68    let mut out_path:    Option<PathBuf> = None;
69
70    while let Some(arg) = args.next() {
71        match arg.as_str() {
72            "--encoder"      => { if let Some(v) = args.next() { encoder_bpk = Some(PathBuf::from(v)); } }
73            "--audio" | "-i" => { if let Some(v) = args.next() { audio_path = Some(PathBuf::from(v)); } }
74            "--out"   | "-o" => { if let Some(v) = args.next() { out_path   = Some(PathBuf::from(v)); } }
75            "--help"  | "-h" => { print_help(); return Ok(()); }
76            other => {
77                eprintln!("Unknown argument: {other}  (use --help for usage)");
78                std::process::exit(1);
79            }
80        }
81    }
82
83    // ── Validate inputs ───────────────────────────────────────────────────────
84    let audio_path = audio_path.ok_or_else(|| {
85        anyhow::anyhow!("No audio file specified.  Use --audio <path.wav>  (--help for more)")
86    })?;
87
88    if !audio_path.exists() {
89        anyhow::bail!("Audio file not found: {}", audio_path.display());
90    }
91
92    let out_path = out_path.unwrap_or_else(|| audio_path.with_extension("npy"));
93
94    // ── Print configuration ───────────────────────────────────────────────────
95    match &encoder_bpk {
96        Some(p) => println!("Encoder  : external BurnPack  {}", p.display()),
97        None    => println!("Encoder  : embedded weights  (wgpu → ndarray fallback)"),
98    }
99    println!("Audio    : {}", audio_path.display());
100    println!("Output   : {}", out_path.display());
101    println!();
102
103    // ── Load encoder ──────────────────────────────────────────────────────────
104    println!("Initialising encoder…");
105    let encoder = match encoder_bpk {
106        Some(ref p) => neutts::NeuCodecEncoder::load(p)
107            .map_err(|e| anyhow::anyhow!("Failed to load encoder from {}: {e}", p.display()))?,
108        None => neutts::NeuCodecEncoder::new()
109            .map_err(|e| anyhow::anyhow!(
110                "{e}\n\n\
111                 Run the one-time setup to embed the encoder:\n\
112                 \n\
113                 \tcargo run --example download_models\n\
114                 \tcargo build\n"
115            ))?,
116    };
117    println!("  → backend : {}", encoder.backend_name());
118    println!();
119
120    // ── Encode ────────────────────────────────────────────────────────────────
121    println!("Encoding {}…", audio_path.display());
122    let codes = encoder.encode_wav(&audio_path)?;
123
124    let duration_s = codes.len() as f32 / 50.0;
125    println!(
126        "  → {} tokens  ({:.2} s of audio at 50 tokens/s)",
127        codes.len(),
128        duration_s,
129    );
130
131    if duration_s < 3.0 {
132        eprintln!(
133            "WARNING: reference is only {duration_s:.1} s — \
134             5–30 s of clean speech gives the best cloning quality."
135        );
136    }
137
138    // ── Save ──────────────────────────────────────────────────────────────────
139    if let Some(parent) = out_path.parent() {
140        if !parent.as_os_str().is_empty() {
141            std::fs::create_dir_all(parent).ok();
142        }
143    }
144
145    neutts::npy::write_npy_i32(&out_path, &codes)?;
146    println!("Saved  →  {}", out_path.display());
147
148    println!();
149    println!(
150        "Use these codes for synthesis:\n\
151         \n\
152         \tcargo run --example basic --features espeak -- \\\n\
153         \t  --text       \"Your text here.\" \\\n\
154         \t  --ref-codes  {} \\\n\
155         \t  --ref-text   \"Transcript of the reference recording.\"",
156        out_path.display()
157    );
158
159    Ok(())
160}
161
162// ─────────────────────────────────────────────────────────────────────────────
163// Helpers
164// ─────────────────────────────────────────────────────────────────────────────
165
/// Print the usage summary for this example to stdout.
fn print_help() {
    // One entry per output line.  The entries are joined with '\n' and
    // emitted by a single `println!`, which appends the final newline.
    let help_lines = [
        "encode_reference — encode a WAV to NeuCodec token IDs (.npy)",
        "",
        "The NeuCodec encoder is compiled into the binary — no external ONNX Runtime.",
        "Wgpu (GPU) is tried first; falls back to NdArray (CPU) automatically.",
        "",
        "SETUP (one-time):",
        "\tcargo run --example download_models && cargo build",
        "",
        "USAGE:",
        "\tcargo run --example encode_reference -- [OPTIONS]",
        "",
        "OPTIONS:",
        "\t--audio  / -i  PATH  Input WAV file (required)",
        "\t--out    / -o  PATH  Output .npy (default: same stem as audio)",
        "\t--encoder      PATH  External BurnPack (.bpk) weight file",
        "\t                     (default: use weights embedded in binary)",
        "\t--help   / -h        Show this help",
        "",
        "FORCE CPU (no wgpu):",
        "\tcargo run --example encode_reference --no-default-features -- --audio <WAV>",
    ];

    println!("{}", help_lines.join("\n"));
}