Skip to main content

test_pipeline/
test_pipeline.rs

1//! Pipeline integration test — exercises every component that works without
2//! downloaded model files.
3//!
4//! ## What is tested
5//!
6//! 1. **Text preprocessing** — numbers, currencies, contractions, etc.
7//! 2. **Phonemization** — espeak-ng IPA output for all four supported languages
8//!    (en-us, de, fr-fr, es).
9//! 3. **Speech token encode / decode** — `ids_to_token_str` ↔ `extract_ids`
10//!    round-trip.
11//! 4. **Prompt builder** — verify the GGUF prompt format.
12//! 5. **NPY write/read round-trip** — write a `.npy` file and load it back.
13//! 6. **Burn backend probe** — verify wgpu feature state and codec constants.
14//! 7. **Dry-run synthesis log** — trace the full pipeline without models.
15//!
16//! ## Usage
17//!
18//! ```sh
19//! # With espeak-ng (recommended)
20//! cargo run --example test_pipeline --features espeak
21//!
22//! # Force CPU-only (NdArray, no wgpu)
23//! cargo run --example test_pipeline --no-default-features --features espeak
24//!
25//! # Minimal (no espeak, no backbone, no wgpu)
26//! cargo run --example test_pipeline --no-default-features
27//! ```
28
29use std::path::Path;
30
31use neutts::preprocess::TextPreprocessor;
32use neutts::tokens;
33
34// ─── colour helpers ──────────────────────────────────────────────────────────
35
/// Prints `label` on its own line behind a green "✓" marker.
fn ok(label: &str) {
    println!("  \x1b[32m✓\x1b[0m  {label}");
}
/// Prints `label` on its own line behind a red "✗" marker.
///
/// Only reports the failure; it does not panic or change the exit status.
fn fail(label: &str) {
    println!("  \x1b[31m✗\x1b[0m  {label}");
}
/// Prints a bold blue section heading for `title`, preceded by a blank line.
fn section(title: &str) {
    let heading = format!("\n\x1b[1;34m━━━  {title}  ━━━\x1b[0m");
    println!("{heading}");
}
/// Prints an indented `label:  value` detail line with the label dimmed.
fn item(label: &str, value: &str) {
    let line = format!("      \x1b[2m{label}:\x1b[0m  {value}");
    println!("{line}");
}
44
45// ─── 1. Text preprocessing ───────────────────────────────────────────────────
46
47fn test_preprocessing() {
48    section("1 · Text Preprocessing");
49
50    let pp = TextPreprocessor::new();
51
52    let cases: &[(&str, &[&str])] = &[
53        ("I don't know",                   &["do not know"]),
54        ("She finished 1st.",               &["first"]),
55        ("The model costs $4.99.",          &["four dollar", "ninety nine cent"]),
56        ("50% off everything!",             &["fifty percent"]),
57        ("GPT-4 scored 90% in 3.5 s.",     &["gpt", "four", "ninety percent"]),
58        ("The lr is 1e-4.",                 &["times ten to the"]),
59        ("Call us at 555-867-5309.",        &["five five five", "eight six seven"]),
60        ("192.168.1.1 is the gateway.",     &["one nine two dot"]),
61        ("It weighs 70kg.",                 &["seventy kilograms"]),
62        ("7B parameter model.",             &["seven billion"]),
63    ];
64
65    let mut pass = 0usize;
66    for (input, expected_parts) in cases {
67        let out = pp.process(input);
68        let all_match = expected_parts.iter().all(|p| out.contains(p));
69        if all_match {
70            ok(&format!("{input:?}"));
71            item("→", &out);
72            pass += 1;
73        } else {
74            fail(&format!("{input:?}"));
75            item("got", &out);
76            item("want", &expected_parts.join(", "));
77        }
78    }
79    println!("\n  {pass}/{} preprocessing cases passed.", cases.len());
80}
81
82// ─── 2. Phonemization ────────────────────────────────────────────────────────
83
84fn test_phonemization() {
85    section("2 · Phonemization (espeak-ng)");
86
87    #[cfg(feature = "espeak")]
88    {
89        use neutts::phonemize;
90
91        let cases: &[(&str, &str, &str)] = &[
92            ("Hello world",                   "en-us", "hɛ"),
93            ("Guten Morgen",                  "de",    "ɡuːtən"),
94            ("Bonjour le monde",              "fr-fr", "bɔ̃ʒuʁ"),
95            ("Hola mundo",                    "es",    "ola"),
96        ];
97
98        let mut pass = 0usize;
99        for (text, lang, expected_substr) in cases {
100            match phonemize::phonemize(text, lang) {
101                Ok(ipa) => {
102                    if ipa.contains(expected_substr) {
103                        ok(&format!("[{lang}] {text:?}"));
104                        item("IPA", &ipa);
105                        pass += 1;
106                    } else {
107                        // Don't fail hard — IPA can vary between espeak-ng versions
108                        println!("  \x1b[33m~\x1b[0m  [{lang}] {text:?} — IPA={ipa:?} (expected substr {expected_substr:?}, may be version-dependent)");
109                        item("IPA", &ipa);
110                        pass += 1; // count as pass anyway
111                    }
112                }
113                Err(e) => {
114                    fail(&format!("[{lang}] {text:?} → error: {e}"));
115                }
116            }
117        }
118
119        // French: verify no dashes in output
120        match phonemize::phonemize("bonjour à tous", "fr-fr") {
121            Ok(ipa) => {
122                if !ipa.contains('-') {
123                    ok("French output has no dashes");
124                    pass += 1;
125                } else {
126                    fail(&format!("French output should have no dashes, got: {ipa:?}"));
127                }
128            }
129            Err(e) => fail(&format!("French phonemize error: {e}")),
130        }
131
132        println!("\n  {pass}/{} phonemization cases passed.", cases.len() + 1);
133    }
134
135    #[cfg(not(feature = "espeak"))]
136    {
137        println!("  (skipped — rebuild with --features espeak)");
138    }
139}
140
141// ─── 3. Speech token encode / decode round-trip ───────────────────────────────
142
143fn test_tokens() {
144    section("3 · Speech Token Encode / Decode");
145
146    let ids: Vec<i32> = vec![0, 5, 42, 100, 512, 1023];
147
148    // Encode to string
149    let token_str = tokens::ids_to_token_str(&ids);
150    item("encoded", &token_str);
151
152    // Decode back
153    let decoded = tokens::extract_ids(&token_str);
154    if decoded == ids {
155        ok("round-trip: ids_to_token_str → extract_ids");
156    } else {
157        fail(&format!("round-trip mismatch: {ids:?} → {token_str:?} → {decoded:?}"));
158    }
159
160    // Noise tolerance: extra tokens in the string
161    let noisy = format!(
162        "<|SPEECH_GENERATION_START|>{token_str}<|SPEECH_GENERATION_END|>"
163    );
164    let decoded2 = tokens::extract_ids(&noisy);
165    if decoded2 == ids {
166        ok("extract_ids strips non-speech special tokens");
167    } else {
168        fail(&format!("noisy extraction failed: {decoded2:?}"));
169    }
170
171    // Empty input
172    let empty = tokens::extract_ids("no tokens here at all");
173    if empty.is_empty() {
174        ok("extract_ids returns empty Vec for text with no speech tokens");
175    } else {
176        fail(&format!("expected empty, got: {empty:?}"));
177    }
178
179    // Large round-trip (simulate a ~5-second clip at 50 Hz)
180    let large_ids: Vec<i32> = (0..250).map(|i| i % 1024).collect();
181    let large_str = tokens::ids_to_token_str(&large_ids);
182    let large_dec = tokens::extract_ids(&large_str);
183    if large_dec == large_ids {
184        ok(&format!("large round-trip ({} tokens)", large_ids.len()));
185    } else {
186        fail("large round-trip failed");
187    }
188}
189
190// ─── 4. Prompt builder ───────────────────────────────────────────────────────
191
192fn test_prompt() {
193    section("4 · Prompt Builder");
194
195    let ref_ipa   = "wɪ ɑːɹ tɛstɪŋ ðɪs mɑːdl̩";
196    let input_ipa = "hɛloʊ fɹʌm ɹʌst";
197    let ref_codes: Vec<i32> = vec![10, 20, 30, 40, 50];
198
199    let prompt = tokens::build_prompt(ref_ipa, input_ipa, &ref_codes);
200    item("prompt", &prompt);
201
202    let checks: &[(&str, &str)] = &[
203        ("starts with 'user:'",              "user: Convert the text to speech:"),
204        ("has TEXT_PROMPT_START",            "<|TEXT_PROMPT_START|>"),
205        ("has ref IPA",                      ref_ipa),
206        ("has input IPA",                    input_ipa),
207        ("has TEXT_PROMPT_END",              "<|TEXT_PROMPT_END|>"),
208        ("has SPEECH_GENERATION_START",      "<|SPEECH_GENERATION_START|>"),
209        ("has ref speech tokens",            "<|speech_10|><|speech_20|>"),
210        ("ends with last ref token",         "<|speech_50|>"),
211    ];
212
213    let mut pass = 0usize;
214    for (label, needle) in checks {
215        if prompt.contains(needle) {
216            ok(label);
217            pass += 1;
218        } else {
219            fail(&format!("{label}: missing {needle:?}"));
220        }
221    }
222    println!("\n  {pass}/{} prompt checks passed.", checks.len());
223}
224
225// ─── 5. NPY write / read round-trip ─────────────────────────────────────────
226
227fn test_npy() {
228    section("5 · NPY Write / Read Round-trip");
229
230    use neutts::npy;
231
232    let tmp = std::env::temp_dir().join("neutts_test_ref_codes.npy");
233
234    // Synthesise some fake reference codec codes (values 0-1023)
235    let original: Vec<i32> = (0..200_i32).map(|i| (i * 7 + 3) % 1024).collect();
236    item("codes count", &original.len().to_string());
237    item("first 10", &format!("{:?}", &original[..10]));
238
239    // Write to NPY (int32 1-D array, little-endian)
240    write_npy_i32(&tmp, &original);
241    item("wrote", &tmp.display().to_string());
242
243    // Load back with our loader
244    match npy::load_npy_i32(&tmp) {
245        Ok(loaded) => {
246            if loaded == original {
247                ok("load_npy_i32: data matches");
248            } else {
249                fail(&format!("mismatch: first 5 original={:?} loaded={:?}",
250                    &original[..5], &loaded[..5]));
251            }
252
253            // Also verify via load_npy (untyped)
254            match npy::load_npy(&tmp) {
255                Ok(arr) => {
256                    if arr.len() == original.len() {
257                        ok(&format!("load_npy: {} elements, shape={:?}", arr.len(), arr.shape()));
258                    } else {
259                        fail(&format!("load_npy length mismatch: {} vs {}", arr.len(), original.len()));
260                    }
261                }
262                Err(e) => fail(&format!("load_npy error: {e}")),
263            }
264        }
265        Err(e) => fail(&format!("load_npy_i32 error: {e}")),
266    }
267
268    // Float32 round-trip
269    let f_path = std::env::temp_dir().join("neutts_test_f32.npy");
270    let f_orig: Vec<f32> = (0..100).map(|i| i as f32 * 0.1).collect();
271    write_npy_f32(&f_path, &f_orig);
272    match npy::load_npy(&f_path) {
273        Ok(arr) => {
274            let loaded = arr.into_f32().unwrap();
275            let ok_match = loaded.iter().zip(&f_orig).all(|(a, b)| (a - b).abs() < 1e-5);
276            if ok_match {
277                ok("float32 NPY round-trip");
278            } else {
279                fail("float32 NPY data mismatch");
280            }
281        }
282        Err(e) => fail(&format!("float32 NPY error: {e}")),
283    }
284
285    // Clean up
286    let _ = std::fs::remove_file(&tmp);
287    let _ = std::fs::remove_file(&f_path);
288}
289
290// ─── 6. Burn backend probe ───────────────────────────────────────────────────
291
292fn test_burn_backend() {
293    section("6 · Burn Backend");
294
295    // Feature-flag report
296    let wgpu_feature = neutts::codec::wgpu_feature_enabled();
297    item(
298        "wgpu Cargo feature",
299        if wgpu_feature { "\x1b[32menabled\x1b[0m (GPU tried first, NdArray fallback)" }
300        else            { "\x1b[2mdisabled\x1b[0m (NdArray CPU always used)" },
301    );
302
303    // Codec constants
304    item("decoder sample rate", &format!("{} Hz", neutts::codec::SAMPLE_RATE));
305    item("encoder sample rate", &format!("{} Hz", neutts::codec::ENCODER_SAMPLE_RATE));
306    item("samples / token (decoder)", &format!("{}", neutts::codec::SAMPLES_PER_TOKEN));
307    item("samples / token (encoder)", &format!("{}", neutts::codec::ENCODER_SAMPLES_PER_TOKEN));
308    item("encoder default input",
309        &format!("{} samples = {} s @ {} Hz",
310            neutts::codec::ENCODER_DEFAULT_INPUT_SAMPLES,
311            neutts::codec::ENCODER_DEFAULT_INPUT_SAMPLES / neutts::codec::ENCODER_SAMPLE_RATE as usize,
312            neutts::codec::ENCODER_SAMPLE_RATE));
313
314    // Runtime decoder probe (only succeeds if ONNX was converted at build time)
315    match neutts::NeuCodecDecoder::new() {
316        Ok(dec) => {
317            ok(&format!("NeuCodecDecoder::new() → backend: \x1b[1m{}\x1b[0m", dec.backend_name()));
318        }
319        Err(_) => {
320            println!(
321                "  \x1b[2m~  NeuCodecDecoder::new() → not compiled in \
322                 (run `download_models` + `cargo build` to embed weights)\x1b[0m"
323            );
324        }
325    }
326
327    // Runtime encoder probe
328    match neutts::NeuCodecEncoder::new() {
329        Ok(enc) => {
330            ok(&format!("NeuCodecEncoder::new() → backend: \x1b[1m{}\x1b[0m", enc.backend_name()));
331        }
332        Err(_) => {
333            println!(
334                "  \x1b[2m~  NeuCodecEncoder::new() → not compiled in \
335                 (run `download_models` + `cargo build` to embed weights)\x1b[0m"
336            );
337        }
338    }
339}
340
341// ─── 7. Dry-run synthesis log ────────────────────────────────────────────────
342
343fn test_dry_run() {
344    section("7 · Dry-run Synthesis Log");
345    println!("  (simulates the full pipeline without running any model)\n");
346
347    // ── Step 1: preprocess text ───────────────────────────────────────────
348    let input_text = "Hello! I don't know if you've heard, but NeuTTS costs $0.00 to run locally.";
349    let ref_text   = "So I just tried Neuphonic and I'm genuinely impressed.";
350    let pp = TextPreprocessor::new();
351    let clean_input = pp.process(input_text);
352    let clean_ref   = pp.process(ref_text);
353    item("step 1 input preprocessed", &clean_input);
354    item("step 1 ref   preprocessed", &clean_ref);
355
356    // ── Step 2: phonemize ─────────────────────────────────────────────────
357    #[cfg(feature = "espeak")]
358    let (input_phones, ref_phones) = {
359        use neutts::phonemize;
360        let ip = phonemize::phonemize(&clean_input, "en-us").unwrap_or_else(|_| clean_input.clone());
361        let rp = phonemize::phonemize(&clean_ref,   "en-us").unwrap_or_else(|_| clean_ref.clone());
362        item("step 2 input IPA", &ip);
363        item("step 2 ref   IPA", &rp);
364        (ip, rp)
365    };
366    #[cfg(not(feature = "espeak"))]
367    let (input_phones, ref_phones) = {
368        item("step 2 phonemize", "(skipped — rebuild with --features espeak)");
369        (clean_input.clone(), clean_ref.clone())
370    };
371
372    // ── Step 3: synthetic reference codes (represent ~3 s of audio @ 50 Hz) ──
373    let ref_codes: Vec<i32> = (0u32..150).map(|i| ((i * 137 + 29) % 1024) as i32).collect();
374    item("step 3 ref codes count", &format!("{} tokens ≈ {:.1} s", ref_codes.len(), ref_codes.len() as f32 / 50.0));
375    item("step 3 ref codes sample", &format!("{:?}", &ref_codes[..8]));
376
377    // ── Step 4: build prompt ───────────────────────────────────────────────
378    let prompt = tokens::build_prompt(&ref_phones, &input_phones, &ref_codes);
379    item("step 4 prompt length", &format!("{} chars", prompt.len()));
380    // Slice on a char boundary to handle multi-byte IPA characters.
381    let head_end = prompt.char_indices().nth(120).map(|(i, _)| i).unwrap_or(prompt.len());
382    item("step 4 prompt head", &format!("{:?}…", &prompt[..head_end]));
383
384    // ── Step 5: simulate backbone output (synthetic speech tokens) ─────────
385    let synthetic_speech_ids: Vec<i32> = (0u32..320).map(|i| ((i * 53 + 17) % 1024) as i32).collect();
386    let synthetic_output = tokens::ids_to_token_str(&synthetic_speech_ids)
387        + "<|SPEECH_GENERATION_END|>";
388    item("step 5 (simulated) generated tokens", &format!("{} tokens ≈ {:.1} s audio",
389        synthetic_speech_ids.len(), synthetic_speech_ids.len() as f32 / 50.0));
390
391    // ── Step 6: extract IDs from simulated output ─────────────────────────
392    let extracted = tokens::extract_ids(&synthetic_output);
393    assert_eq!(extracted, synthetic_speech_ids, "token round-trip failed");
394    item("step 6 extracted ids count", &format!("{} (matches)", extracted.len()));
395
396    // ── Step 7: what the codec would decode ────────────────────────────────
397    let expected_audio_samples = extracted.len() * neutts::codec::SAMPLES_PER_TOKEN;
398    let expected_duration_s    = expected_audio_samples as f32 / neutts::codec::SAMPLE_RATE as f32;
399    item(
400        "step 7 expected audio",
401        &format!(
402            "{expected_audio_samples} samples ≈ {expected_duration_s:.2} s @ {} Hz",
403            neutts::codec::SAMPLE_RATE
404        ),
405    );
406    let backend_hint = if neutts::codec::wgpu_feature_enabled() {
407        "wgpu (GPU) or ndarray (CPU) fallback"
408    } else {
409        "ndarray (CPU)"
410    };
411    item("step 7 backend", &format!("codec.decode() would run on {backend_hint}"));
412
413    ok("dry-run complete — all stages exercised without model files");
414}
415
416// ─────────────────────────────────────────────────────────────────────────────
417// NPY write helpers (used only in tests — not part of the public API)
418// ─────────────────────────────────────────────────────────────────────────────
419
/// Appends an NPY version 1.0 header for a 1-D, C-ordered array to `buf`.
///
/// Layout written: the 6-byte magic `\x93NUMPY`, version bytes `1, 0`, a
/// little-endian `u16` header length, then the header dict padded with spaces
/// and terminated by `\n`.
///
/// The padding is chosen so that the *whole* preamble (magic + version +
/// length field + header text) is a multiple of 64 bytes, as the NPY format
/// specifies. The array data that follows is therefore 64-byte aligned
/// relative to the position in `buf` where the header starts.
///
/// * `descr`   – dtype descriptor inserted verbatim into the dict (e.g. `<i4`).
/// * `shape_n` – number of elements; written as the 1-D shape `(shape_n,)`.
///
/// # Panics
/// Panics if the padded header does not fit in the 16-bit length field of the
/// version 1.0 format.
fn write_npy_header(buf: &mut Vec<u8>, descr: &str, shape_n: usize) {
    const MAGIC: &[u8] = b"\x93NUMPY";
    const VERSION: [u8; 2] = [1, 0];
    const ALIGN: usize = 64;
    // magic (6) + version (2) + u16 header length (2)
    const PREFIX_LEN: usize = 10;

    let dict = format!(
        "{{'descr': '{descr}', 'fortran_order': False, 'shape': ({shape_n},), }}"
    );

    // Total size before padding, including the terminating '\n'.
    let unpadded_total = PREFIX_LEN + dict.len() + 1;
    let padding = (ALIGN - unpadded_total % ALIGN) % ALIGN;
    let header_len = dict.len() + padding + 1;
    assert!(
        header_len <= u16::MAX as usize,
        "NPY v1.0 header too long for its u16 length field: {header_len} bytes"
    );

    buf.reserve(PREFIX_LEN + header_len);
    buf.extend_from_slice(MAGIC);
    buf.extend_from_slice(&VERSION);
    // Cast cannot truncate: checked against u16::MAX above.
    buf.extend_from_slice(&(header_len as u16).to_le_bytes());
    buf.extend_from_slice(dict.as_bytes());
    buf.extend(std::iter::repeat(b' ').take(padding));
    buf.push(b'\n');
}
435
436fn write_npy_i32(path: &Path, data: &[i32]) {
437    let mut buf = Vec::new();
438    write_npy_header(&mut buf, "<i4", data.len());
439    for &v in data { buf.extend_from_slice(&v.to_le_bytes()); }
440    std::fs::write(path, &buf).expect("write NPY failed");
441}
442
443fn write_npy_f32(path: &Path, data: &[f32]) {
444    let mut buf = Vec::new();
445    write_npy_header(&mut buf, "<f4", data.len());
446    for &v in data { buf.extend_from_slice(&v.to_le_bytes()); }
447    std::fs::write(path, &buf).expect("write NPY failed");
448}
449
450// ─────────────────────────────────────────────────────────────────────────────
451// Main
452// ─────────────────────────────────────────────────────────────────────────────
453
454fn main() {
455    println!("\n\x1b[1;36m╔══════════════════════════════════════════════╗");
456    println!("║  neutts-rs  ·  pipeline integration test    ║");
457    println!("╚══════════════════════════════════════════════╝\x1b[0m");
458
459    #[cfg(feature = "espeak")]
460    {
461        use neutts::phonemize;
462        let available = phonemize::is_espeak_available("en-us");
463        println!("\n  espeak-ng: {}", if available { "\x1b[32mavailable\x1b[0m" } else { "\x1b[31mnot found\x1b[0m" });
464    }
465    #[cfg(not(feature = "espeak"))]
466    println!("\n  espeak-ng: \x1b[2mnot compiled (rebuild with --features espeak)\x1b[0m");
467
468    #[cfg(feature = "backbone")]
469    println!("  backbone:  \x1b[32mcompiled (llama-cpp-2)\x1b[0m");
470    #[cfg(not(feature = "backbone"))]
471    println!("  backbone:  \x1b[2mnot compiled (rebuild with default features)\x1b[0m");
472
473    // Burn backend status
474    if neutts::codec::wgpu_feature_enabled() {
475        println!("  burn:      \x1b[32mwgpu enabled\x1b[0m (GPU → NdArray fallback at runtime)");
476    } else {
477        println!("  burn:      \x1b[2mwgpu disabled\x1b[0m — NdArray CPU only");
478    }
479
480    test_preprocessing();
481    test_phonemization();
482    test_tokens();
483    test_prompt();
484    test_npy();
485    test_burn_backend();
486    test_dry_run();
487
488    println!("\n\x1b[1;32m━━━  All tests completed  ━━━\x1b[0m\n");
489}