rlx-tiny-tts 0.2.9

TinyTTS English text-to-speech (VITS2/MeloTTS, 44.1 kHz) for RLX — all backends
//! TinyTTS command-line synthesizer.
//!
//! ```text
//! rlx-tiny-tts --data weights/tiny-tts-rlx --text "Hello world." --out out.wav
//!              [--device cpu|metal|mlx|cuda|rocm|gpu] [--speaker MALE]
//!              [--speed 1.0] [--seed 1234]
//! ```

use std::path::PathBuf;

use anyhow::{Context, Result};
use rlx_tiny_tts::{InferOpts, TinyTts, audio};

/// Returns the command-line argument that immediately follows the first
/// occurrence of `flag`, or `None` when `flag` is absent or is the last argument.
///
/// The whole argument list (program name included) is searched on every call.
fn opt(flag: &str) -> Option<String> {
    // Materialize the full list first so every argument is decoded up front,
    // then walk it with an owning iterator.
    let mut argv = std::env::args().collect::<Vec<String>>().into_iter();

    // Advance past the first argument equal to `flag`; bail out if there is none.
    argv.find(|candidate| candidate == flag)?;

    // Whatever comes next (if anything) is the flag's value.
    argv.next()
}

/// Parses the value of `flag` as `T`, falling back to `default` when the flag
/// is absent (or has no value after it).
///
/// # Errors
/// Returns an error naming the flag and the offending text when a value is
/// present but does not parse as `T`, instead of silently using the default.
fn parsed_opt<T>(flag: &str, default: T) -> Result<T>
where
    T: std::str::FromStr,
    T::Err: std::error::Error + Send + Sync + 'static,
{
    match opt(flag) {
        Some(raw) => raw
            .parse()
            .with_context(|| format!("invalid value {raw:?} for {flag}")),
        None => Ok(default),
    }
}

/// Entry point: reads the CLI flags, loads the TinyTTS bundle, synthesizes the
/// requested text on the selected device, writes the normalized audio to a WAV
/// file and prints a one-line timing summary.
///
/// # Errors
/// Fails when `--speed` / `--seed` / `--device` cannot be parsed, when the
/// model bundle cannot be loaded, when synthesis fails, or when the WAV file
/// cannot be written.
fn main() -> Result<()> {
    let data = opt("--data").unwrap_or_else(|| "weights/tiny-tts-rlx".to_string());
    let text = opt("--text")
        .unwrap_or_else(|| "The weather is nice today, and I feel very relaxed.".to_string());
    let out = opt("--out").unwrap_or_else(|| "out.wav".to_string());
    let speaker = opt("--speaker").unwrap_or_else(|| "MALE".to_string());
    // Malformed numbers are reported rather than replaced by the defaults.
    let speed: f32 = parsed_opt("--speed", 1.0)?;
    let seed: u64 = parsed_opt("--seed", 1234)?;

    let device = match opt("--device") {
        Some(d) => rlx_runtime::parse_device(&d).map_err(|e| anyhow::anyhow!("{e}"))?,
        None => TinyTts::preferred_device(),
    };

    let model = TinyTts::load_from_dir(&PathBuf::from(&data))
        .with_context(|| format!("load TinyTTS bundle from {data}"))?;

    let mut opts = InferOpts::from_config(model.config());
    // speed>1 → faster → fewer frames; the floor keeps the division finite
    // for zero, negative or NaN speeds.
    opts.length_scale = 1.0 / speed.max(1e-3);
    opts.seed = seed;
    // NOTE(review): the lookup result is discarded, so unless `speaker_id`
    // panics on unknown names this neither rejects a bad `--speaker` nor
    // selects the voice — confirm how the speaker is meant to reach `opts`.
    let _ = model.config().speaker_id(&speaker);

    // Only the synthesis call is timed; normalization and file I/O are excluded.
    let t0 = std::time::Instant::now();
    let wav = model.synthesize_on(&text, device, &opts)?;
    let secs = wav.samples.len() as f32 / wav.sample_rate as f32;
    let elapsed = t0.elapsed().as_secs_f32();

    let normalized = audio::normalize_audio(&wav.samples);
    audio::write_wav(&PathBuf::from(&out), &normalized, wav.sample_rate)?;
    // The last figure is audio seconds per synthesis second; the floor on
    // `elapsed` avoids dividing by zero.
    println!(
        "[tiny-tts] device={device:?} \"{text}\" -> {out}  ({secs:.2}s audio @ {} Hz, {elapsed:.2}s synth, {:.1}× RT)",
        wav.sample_rate,
        secs / elapsed.max(1e-6),
    );
    Ok(())
}