use std::io::{self, Write as _};
use std::path::PathBuf;
use std::time::Instant;
use anyhow::Context as _;
fn main() -> anyhow::Result<()> {
let mut codes_path: Option<PathBuf> = None;
let mut ref_text_arg: Option<String> = None;
let mut text = String::new();
let mut backbone = "neuphonic/neutts-nano-q4-gguf".to_string();
let mut gguf_file: Option<String> = None;
let mut chunk_size: usize = 25;
let mut args = std::env::args().skip(1).peekable();
while let Some(arg) = args.next() {
match arg.as_str() {
"--codes" | "-c" => codes_path = args.next().map(PathBuf::from),
"--ref-text" | "-r" => ref_text_arg = args.next(),
"--text" | "-t" => text = args.next().unwrap_or_default(),
"--backbone" | "-b" => backbone = args.next().unwrap_or(backbone),
"--gguf-file" | "-g" => gguf_file = args.next(),
"--chunk" | "-k" => {
chunk_size = args.next()
.as_deref()
.and_then(|s| s.parse().ok())
.unwrap_or(chunk_size);
}
"--help" | "-h" => { print_help(); return Ok(()); }
other => {
anyhow::bail!("Unknown argument: {other}\nRun with --help for usage.");
}
}
}
if text.is_empty() {
anyhow::bail!("--text is required.\n\nRun with --help for usage.");
}
let codes_path = codes_path
.ok_or_else(|| anyhow::anyhow!("--codes <path.npy> is required.\n\nRun with --help for usage."))?;
let ref_text = match ref_text_arg {
Some(v) => {
let p = std::path::Path::new(&v);
if p.exists() {
std::fs::read_to_string(p).map(|s| s.trim().to_string()).unwrap_or(v)
} else {
v
}
}
None => {
let sibling = codes_path.with_extension("txt");
if sibling.exists() {
eprintln!("[stream_pcm] Auto-loaded ref text from {}", sibling.display());
std::fs::read_to_string(&sibling)
.map(|s| s.trim().to_string())
.unwrap_or_default()
} else {
anyhow::bail!(
"--ref-text is required (transcript of the reference recording).\n\
Pass a string or a path to a .txt file."
);
}
}
};
if ref_text.is_empty() {
anyhow::bail!("Reference text is empty — provide a non-empty transcript.");
}
eprintln!("┌─ stream_pcm ────────────────────────────────────────────────────");
eprintln!("│ backbone : {backbone}");
eprintln!("│ codes : {}", codes_path.display());
eprintln!("│ ref text : {:?}", truncate(&ref_text, 72));
eprintln!("│ text : {:?}", truncate(&text, 72));
eprintln!("│ chunk : {chunk_size} tokens ({:.0} ms)", chunk_size as f32 * 1000.0 / 50.0);
eprintln!("│ output : stdout (raw s16le, 24 kHz, mono)");
eprintln!("└─────────────────────────────────────────────────────────────────");
eprintln!();
eprintln!("[stream_pcm] Preloading models…");
let t_load = Instant::now();
let tts = neutts::download::load_from_hub_cb(&backbone, gguf_file.as_deref(), |p| {
use neutts::download::LoadProgress;
match &p {
LoadProgress::Fetching { step, total, file, repo, .. } =>
eprintln!(" [{step}/{total}] Fetching {file} from {repo}…"),
LoadProgress::Downloading { step, total, downloaded, total_bytes } => {
let pct = if *total_bytes > 0 {
(*downloaded as f64 / *total_bytes as f64 * 100.0) as u32
} else { 0 };
eprint!("\r [{step}/{total}] {pct:3}% ({:.1} / {:.1} MB)",
*downloaded as f64 / 1_048_576.0,
*total_bytes as f64 / 1_048_576.0);
let _ = io::stderr().flush();
}
LoadProgress::Loading { step, total, component } => {
eprintln!();
eprintln!(" [{step}/{total}] Loading {component}…");
}
}
})?;
eprintln!(" → codec : {}", tts.codec.backend_name());
eprintln!(" → loaded in {:.2} s", t_load.elapsed().as_secs_f32());
eprintln!();
anyhow::ensure!(codes_path.exists(), "Codes file not found: {}", codes_path.display());
eprintln!("[stream_pcm] Loading codes from {}…", codes_path.display());
let ref_codes = tts.load_ref_codes(&codes_path)?;
eprintln!(
" → {} tokens ({:.1} s of reference audio)",
ref_codes.len(),
ref_codes.len() as f32 / 50.0,
);
eprintln!();
eprintln!("[stream_pcm] Phonemizing…");
let ref_phones = neutts::phonemize::phonemize(&ref_text, "en-us")
.context("Phonemisation of ref_text failed")?;
let input_phones = neutts::phonemize::phonemize(&text, "en-us")
.context("Phonemisation of text failed")?;
let prompt = neutts::tokens::build_prompt(&ref_phones, &input_phones, &ref_codes);
eprintln!("[stream_pcm] Synthesising…");
let stdout = io::stdout();
let mut out = io::BufWriter::new(stdout.lock());
let backbone_model = &tts.backbone;
let codec = &tts.codec;
let mut pending: Vec<i32> = Vec::with_capacity(chunk_size + 8);
let mut total_samples: usize = 0;
let mut total_tokens: usize = 0;
let mut t_first_chunk: Option<f32> = None;
let t_synth = Instant::now();
backbone_model.generate_streaming(&prompt, 2048, |piece| {
let ids = neutts::tokens::extract_ids(piece);
if ids.is_empty() {
return Ok(());
}
pending.extend_from_slice(&ids);
total_tokens += ids.len();
if pending.len() < chunk_size {
return Ok(());
}
let audio = codec.decode(&pending)
.context("NeuCodec decode failed")?;
if t_first_chunk.is_none() {
let ttfa = t_synth.elapsed().as_secs_f32();
t_first_chunk = Some(ttfa);
eprintln!(
" → first audio chunk after {ttfa:.2} s \
({} tokens, {:.0} ms of audio)",
pending.len(),
pending.len() as f32 * 1000.0 / 50.0,
);
}
write_pcm_chunk(&audio, &mut out)?;
total_samples += audio.len();
pending.clear();
Ok(())
})?;
if !pending.is_empty() {
let audio = codec.decode(&pending)
.context("NeuCodec decode (tail) failed")?;
write_pcm_chunk(&audio, &mut out)?;
total_samples += audio.len();
total_tokens += pending.len();
}
out.flush().context("stdout flush failed")?;
let elapsed = t_synth.elapsed().as_secs_f32();
let audio_dur = total_samples as f32 / neutts::SAMPLE_RATE as f32;
let rtf = if audio_dur > 0.0 { elapsed / audio_dur } else { 0.0 };
eprintln!();
eprintln!(
"[stream_pcm] Done: {total_tokens} tokens → {total_samples} samples \
({audio_dur:.2} s of audio)"
);
eprintln!(
"[stream_pcm] Timing: total {elapsed:.2} s | RTF {rtf:.2}x | \
TTFA {:.2} s",
t_first_chunk.unwrap_or(elapsed),
);
Ok(())
}
/// Converts `samples` to signed 16-bit little-endian PCM and writes them to `out`.
///
/// Each sample is clamped to `[-1.0, 1.0]` and scaled by `i16::MAX` before the
/// cast. The writer is flushed after every call so the chunk is handed on
/// immediately.
///
/// # Errors
///
/// Returns an error if writing to or flushing `out` fails.
fn write_pcm_chunk(samples: &[f32], out: &mut impl io::Write) -> anyhow::Result<()> {
    // Two bytes per sample; reserve the whole buffer up front.
    let mut pcm: Vec<u8> = Vec::with_capacity(samples.len() * 2);
    for &sample in samples {
        let quantised = (sample.clamp(-1.0, 1.0) * i16::MAX as f32) as i16;
        pcm.extend_from_slice(&quantised.to_le_bytes());
    }
    out.write_all(&pcm).context("PCM write failed")?;
    out.flush().context("PCM flush failed")?;
    Ok(())
}
/// Returns `s` limited to at most `max` characters.
///
/// When `s` has more than `max` characters, the first `max` are kept and a
/// single `…` is appended; otherwise `s` is returned unchanged as an owned
/// `String`. Counting is per `char`, so multi-byte characters are never split.
fn truncate(s: &str, max: usize) -> String {
    // The character at index `max` (if any) is the first one to drop; its
    // byte offset is where the kept prefix ends.
    match s.char_indices().nth(max) {
        Some((cut, _)) => format!("{}…", &s[..cut]),
        None => s.to_owned(),
    }
}
/// Prints the usage summary for `stream_pcm` to stderr.
///
/// The text lists the required and optional flags, the PCM format written to
/// stdout, and example commands for playing or saving the stream.
fn print_help() {
    // Each `\n\` ends a help line and continues the literal on the next
    // source line; whitespace before the following `\t` is not part of the text.
    const USAGE: &str = "stream_pcm — preload models once, stream raw PCM to stdout\n\
        \n\
        USAGE:\n\
        \tcargo run --example stream_pcm --features espeak -- [OPTIONS] | <player>\n\
        \n\
        REQUIRED:\n\
        \t--codes / -c PATH Pre-encoded .npy reference codes\n\
        \t--text / -t TEXT Text to synthesise\n\
        \n\
        OPTIONS:\n\
        \t--ref-text / -r TEXT Transcript of the reference recording\n\
        \t (auto-detected from <codes_stem>.txt if omitted)\n\
        \t--backbone / -b REPO HuggingFace backbone repo\n\
        \t (default: neuphonic/neutts-nano-q4-gguf)\n\
        \t--gguf-file / -g FILE Specific GGUF filename within the repo\n\
        \t--chunk / -k N Tokens per decode chunk (default: 25 = 500 ms)\n\
        \t--help / -h Show this help\n\
        \n\
        PCM FORMAT (stdout):\n\
        \tSigned 16-bit little-endian, 24 000 Hz, mono\n\
        \n\
        PLAYBACK EXAMPLES:\n\
        \t... | aplay -f S16_LE -r 24000 -c 1 # Linux\n\
        \t... | sox -t raw -r 24000 -e signed -b 16 -c 1 - -d # macOS\n\
        \t... | ffplay -f s16le -ar 24000 -ac 1 -nodisp - # cross-platform\n\
        \n\
        SAVE TO FILE:\n\
        \t... > output.pcm\n\
        \tsox -t raw -r 24000 -e signed -b 16 -c 1 output.pcm output.wav\n\
        \tffmpeg -f s16le -ar 24000 -ac 1 -i output.pcm output.wav";
    eprintln!("{}", USAGE);
}