espeak-ng 0.1.1

Pure Rust port of eSpeak NG text-to-speech
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
//! Cascade formant synthesizer for the generic IPA → PCM path.

// src/synthesize/engine.rs
//
// Cascade formant synthesizer.
//
// Architecture:
//   1. Parse the IPA string into a sequence of `Segment`s (phoneme + timing).
//   2. For each segment, linearly interpolate the three formant frequencies and
//      bandwidths from the previous segment's targets to the current targets.
//      Resonator coefficients are recomputed every STEPSIZE samples (64 samples
//      ≈ 2.9 ms @ 22 050 Hz) to avoid per-sample transcendental-function calls.
//   3. The source signal is a mixture of:
//        • Voiced:   a shaped glottal pulse train at F0 (from VoiceParams).
//        • Unvoiced: white noise via a xorshift PRNG.
//      The mixing ratio is controlled by the phoneme's `voiced_frac` /
//      `noise_frac` fields.
//   4. The source is filtered through a cascade of three second-order IIR
//      resonators (F1, F2, F3).  The resonator state persists across phoneme
//      boundaries, giving smooth formant transitions automatically.
//   5. Output is peak-normalised to 90 % of i16 full scale and converted to
//      i16; a final clamp to ±32 767 guards the conversion against overflow.

use std::f64::consts::PI;

use super::targets::{FormantTarget, SILENCE, match_ipa};
use super::{Resonator, VoiceParams, PcmBuffer};

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/// Synthesis sample rate (Hz).  All durations and filter coefficients in this
/// module are computed against this rate.
pub const SAMPLE_RATE: f64 = 22_050.0;

/// Coefficient update interval (samples).  Matches espeak-ng's `STEPSIZE`.
/// At 22 050 Hz, 64 samples is roughly 2.9 ms.
const STEPSIZE: usize = 64;

// ---------------------------------------------------------------------------
// Resonator coefficient helpers
// ---------------------------------------------------------------------------

/// Derive the coefficients of a two-pole digital resonator (Klatt 1980).
///
/// The resonator implements the difference equation
/// ```text
///   y[n] = a·x[n] + b·y[n-1] + c·y[n-2]
/// ```
/// with `r = exp(−π·bw/fs)`, `c = −r²`, `b = 2·r·cos(2π·f/fs)` and
/// `a = 1 − b − c`.  Choosing `a` this way gives the filter unity gain at DC.
///
/// * `f`  – centre frequency in Hz, limited to `50.0 ..= 0.49·fs`.
/// * `bw` – bandwidth in Hz, limited to `10.0 ..= 0.25·fs`.
/// * `fs` – sample rate in Hz.
///
/// Returns the tuple `(a, b, c)`.
#[inline]
fn resonator_coeffs(f: f64, bw: f64, fs: f64) -> (f64, f64, f64) {
    // Pull out-of-range requests back into a region where the pole stays
    // below Nyquist and the bandwidth is strictly positive.
    let centre = f.clamp(50.0, fs * 0.49);
    let width = bw.clamp(10.0, fs * 0.25);

    let pole_radius = (-PI * width / fs).exp();
    let pole_angle = 2.0 * PI * centre / fs;

    let feedback2 = -(pole_radius * pole_radius);
    let feedback1 = 2.0 * pole_radius * pole_angle.cos();
    let gain = 1.0 - feedback1 - feedback2;

    (gain, feedback1, feedback2)
}

/// Retune `r` to centre frequency `f` and bandwidth `bw` (both in Hz) at
/// sample rate `fs`.
///
/// Only the coefficient fields `a`, `b` and `c` are overwritten; nothing else
/// on the resonator is touched by this function.
#[inline]
fn set_resonator(r: &mut Resonator, f: f64, bw: f64, fs: f64) {
    let coeffs = resonator_coeffs(f, bw, fs);
    r.a = coeffs.0;
    r.b = coeffs.1;
    r.c = coeffs.2;
}

// ---------------------------------------------------------------------------
// Segment — one phoneme with timing
// ---------------------------------------------------------------------------

/// A parsed phoneme event ready for synthesis.
///
/// Built by [`parse_ipa`] (one per matched phoneme or inter-word pause) and
/// consumed in order by [`synthesize_segments`].
#[derive(Debug, Clone)]
pub struct Segment {
    /// Acoustic target for this phoneme: formant frequencies and bandwidths,
    /// voiced/noise source mix and base amplitude.
    pub target: FormantTarget,
    /// Duration in samples at [`SAMPLE_RATE`] (already rate-adjusted).
    /// A segment with `dur_samples == 0` is skipped by the synthesizer.
    pub dur_samples: usize,
    /// Amplitude multiplier (1.0 = normal; > 1.0 = stressed).  The parser
    /// uses 1.3 after a primary and 1.1 after a secondary stress mark.
    pub amp_factor: f64,
}

// ---------------------------------------------------------------------------
// IPA parser
// ---------------------------------------------------------------------------

/// Parse an IPA string produced by `Translator::text_to_ipa()` into a
/// sequence of `Segment`s suitable for synthesis.
///
/// Handled characters:
/// * `ˈ` (U+02C8) primary stress   → `amp_factor = 1.3` on next phoneme.
/// * `ˌ` (U+02CC) secondary stress → `amp_factor = 1.1` on next phoneme.
/// * `ː` (U+02D0) length mark      → should have been consumed as part of
///   a long-vowel digraph (e.g. "iː").  If seen naked, ignored.
/// * ` ` (ASCII space)             → short inter-word pause (60 ms at normal
///   speed); also cancels any pending stress.
/// * Everything else → longest-prefix match via `match_ipa`; characters with
///   no match are skipped one at a time.
///
/// All durations are scaled by `100 / voice.speed_percent` (a value below 1
/// is treated as 1).  Returns the segments in input order; an empty input
/// yields an empty vector.
pub fn parse_ipa(ipa: &str, voice: &VoiceParams) -> Vec<Segment> {
    let mut segments = Vec::new();
    let mut s = ipa;
    // Speed: 100 = normal (1×), 200 = double speed (÷2), 50 = half speed (×2).
    // `.max(1)` keeps the divisor non-zero.
    let speed_factor = 100.0 / voice.speed_percent.max(1) as f64;
    // Stress multiplier waiting to be attached to the next matched phoneme.
    let mut pending_amp = 1.0_f64;

    while !s.is_empty() {
        // ── Stress marks ────────────────────────────────────────────────────
        if let Some(rest) = s.strip_prefix('ˈ') {   // primary stress U+02C8
            pending_amp = 1.3;
            s = rest;
            continue;
        }
        if let Some(rest) = s.strip_prefix('ˌ') {   // secondary stress U+02CC
            pending_amp = 1.1;
            s = rest;
            continue;
        }
        // ── Stray length mark (shouldn't appear outside a long-vowel digraph)
        if let Some(rest) = s.strip_prefix('ː') {   // U+02D0
            s = rest;
            continue;
        }
        // ── Inter-word space → very short silence ──────────────────────────
        if let Some(rest) = s.strip_prefix(' ') {
            let dur_ms = 60.0 * speed_factor;
            segments.push(Segment {
                target: SILENCE,
                dur_samples: ms_to_samples(dur_ms),
                amp_factor: 1.0,
            });
            s = rest;
            pending_amp = 1.0;
            continue;
        }

        // ── Try longest-prefix IPA match ────────────────────────────────────
        match match_ipa(s) {
            // A match must consume at least one byte; a zero-length match
            // would leave `s` unchanged and the loop would never terminate.
            Some((target, consumed)) if consumed > 0 => {
                let dur_ms = target.dur_ms * speed_factor;
                segments.push(Segment {
                    target: *target,
                    dur_samples: ms_to_samples(dur_ms),
                    amp_factor: pending_amp,
                });
                pending_amp = 1.0;
                s = &s[consumed..];
            }
            // Unknown IPA character (or degenerate empty match) — skip exactly
            // one Unicode scalar value and carry on.
            _ => {
                let mut chars = s.chars();
                chars.next();
                s = chars.as_str();
            }
        }
    }

    segments
}

/// Convert a duration in milliseconds into a whole number of samples at
/// `SAMPLE_RATE`, rounded to nearest and never fewer than one sample.
#[inline]
fn ms_to_samples(ms: f64) -> usize {
    let exact = (ms / 1000.0) * SAMPLE_RATE;
    // Floor at one sample so that even a vanishing duration yields output.
    let at_least_one = exact.max(1.0);
    at_least_one.round() as usize
}

// ---------------------------------------------------------------------------
// Xorshift32 PRNG (fast white noise)
// ---------------------------------------------------------------------------

/// Minimal xorshift32 generator used as the white-noise source.
///
/// The wrapped `u32` is the generator state.
struct Xorshift32(u32);

impl Xorshift32 {
    /// Create a generator with a fixed non-zero seed, so every instance
    /// yields the same sequence.
    fn new() -> Self {
        Self(0xBAD_5EED)
    }

    /// Advance the state by one xorshift round (13 / 17 / 5) and return it
    /// reinterpreted as a signed value scaled into −1.0 … +1.0.
    #[inline]
    fn next_f64(&mut self) -> f64 {
        const INV_2_POW_31: f64 = 1.0 / 2_147_483_648.0;

        let mut state = self.0;
        state ^= state << 13;
        state ^= state >> 17;
        state ^= state << 5;
        self.0 = state;

        f64::from(state as i32) * INV_2_POW_31
    }
}

// ---------------------------------------------------------------------------
// Glottal pulse model
// ---------------------------------------------------------------------------

/// Evaluate the glottal source waveform at one point of the pitch cycle.
///
/// `phase` ∈ [0, 1) is the fractional position within the F0 cycle.  The
/// pulse is asymmetric:
///   * Opening phase (0.0 – 0.65): half a cosine, rising smoothly −1 → +1.
///   * Closing phase (0.65 – 1.0): straight-line fall +1 → −1, i.e. a faster
///     return that models the abrupt glottal closure.
///
/// Both halves integrate to zero, so the waveform carries no DC component,
/// and it is continuous across the cycle boundary (−1 at both ends).
#[inline]
fn glottal_sample(phase: f64) -> f64 {
    /// Fraction of the cycle spent opening.
    const OPEN_FRACTION: f64 = 0.65;
    /// Fraction of the cycle spent closing.
    const CLOSE_FRACTION: f64 = 0.35;

    if phase < OPEN_FRACTION {
        // sin(π·t − π/2) = −cos(π·t): −1 at t = 0, +1 at t = 1.
        let opening = phase / OPEN_FRACTION;
        return (PI * opening - PI * 0.5).sin();
    }

    // Linear ramp from +1 down to −1 over the remainder of the cycle.
    let closing = (phase - OPEN_FRACTION) / CLOSE_FRACTION;
    1.0 - 2.0 * closing
}

// ---------------------------------------------------------------------------
// Core synthesis loop
// ---------------------------------------------------------------------------

/// Synthesize a sequence of `Segment`s to a `PcmBuffer`.
///
/// # Algorithm
/// For each segment the formant frequencies and bandwidths are linearly
/// interpolated from the *previous* segment's values to the *current*
/// segment's values.  Resonator coefficients are updated every `STEPSIZE`
/// samples.  The three resonators run in cascade (F1 → F2 → F3) and keep
/// their state across segment boundaries.
///
/// The excitation is a glottal pulse train at `voice.pitch_hz` (at least
/// 50 Hz) mixed with white noise according to the target's `voiced_frac` /
/// `noise_frac`, shaped by a 4 ms fade-in / fade-out per segment.
///
/// # Returns
/// One output sample per input `dur_samples`, peak-normalised by
/// `f64_to_i16`.  An empty segment list yields an empty buffer; segments
/// with `dur_samples == 0` are skipped.
pub fn synthesize_segments(segments: &[Segment], voice: &VoiceParams) -> PcmBuffer {
    let fs = SAMPLE_RATE;
    let f0 = voice.pitch_hz.max(50) as f64;
    let global_amp = voice.amplitude.clamp(0, 100) as f64 / 100.0;

    // Length of the per-segment fade ramps (4 ms); constant for the whole run.
    let fade_len = (0.004 * fs) as usize;

    // Total sample count (pre-allocate)
    let total: usize = segments.iter().map(|s| s.dur_samples).sum();
    let mut output: Vec<f64> = Vec::with_capacity(total);

    // Resonator filter state — persists across segments for smooth transitions.
    let mut r1 = Resonator::default();
    let mut r2 = Resonator::default();
    let mut r3 = Resonator::default();

    // Initialise resonators to neutral (schwa-like) values.
    set_resonator(&mut r1, 500.0, 150.0, fs);
    set_resonator(&mut r2, 1500.0, 200.0, fs);
    set_resonator(&mut r3, 2500.0, 300.0, fs);

    let mut rng = Xorshift32::new();

    // Glottal oscillator phase, 0.0..1.0 per cycle.
    let mut phase = 0.0_f64;
    let phase_inc = f0 / fs;

    // Previous segment's formant targets (for interpolation).  They start at
    // the same neutral values the resonators were initialised with.
    let mut prev_f1 = 500.0_f64;
    let mut prev_f2 = 1500.0_f64;
    let mut prev_f3 = 2500.0_f64;
    let mut prev_bw1 = 150.0_f64;
    let mut prev_bw2 = 200.0_f64;
    let mut prev_bw3 = 300.0_f64;

    for seg in segments {
        let n = seg.dur_samples;
        if n == 0 { continue; }

        let tgt = &seg.target;
        // Effective per-phoneme amplitude.
        let seg_amp = tgt.amp * seg.amp_factor * global_amp;

        let voiced = tgt.voiced_frac;
        let noise  = tgt.noise_frac;

        // Formant endpoints for this segment.
        let to_f1  = tgt.f1;  let to_f2  = tgt.f2;  let to_f3  = tgt.f3;
        let to_bw1 = tgt.bw1; let to_bw2 = tgt.bw2; let to_bw3 = tgt.bw3;

        let mut step_start = 0usize;

        while step_start < n {
            let step_end = (step_start + STEPSIZE).min(n);
            let step_len = step_end - step_start;

            // Interpolation factor at the midpoint of this step.
            let t_mid = (step_start as f64 + step_len as f64 * 0.5) / n as f64;

            // Interpolate formant parameters and update resonator coefficients.
            let f1  = prev_f1  + (to_f1  - prev_f1)  * t_mid;
            let f2  = prev_f2  + (to_f2  - prev_f2)  * t_mid;
            let f3  = prev_f3  + (to_f3  - prev_f3)  * t_mid;
            let bw1 = prev_bw1 + (to_bw1 - prev_bw1) * t_mid;
            let bw2 = prev_bw2 + (to_bw2 - prev_bw2) * t_mid;
            let bw3 = prev_bw3 + (to_bw3 - prev_bw3) * t_mid;

            set_resonator(&mut r1, f1, bw1, fs);
            set_resonator(&mut r2, f2, bw2, fs);
            set_resonator(&mut r3, f3, bw3, fs);

            // Run the synthesizer for this step; `i_abs` is the sample index
            // within the segment.
            for i_abs in step_start..step_end {
                // ── Source signal ────────────────────────────────────────────
                let voiced_src = if voiced > 0.0 {
                    glottal_sample(phase) * voiced
                } else {
                    0.0
                };

                // The PRNG is only advanced for segments that use noise.
                let noise_src = if noise > 0.0 {
                    rng.next_f64() * noise
                } else {
                    0.0
                };

                let source = voiced_src + noise_src;

                // ── Amplitude envelope (4-ms fade-in / fade-out) ─────────────
                let x = source * fade_envelope(i_abs, n, fade_len);

                // ── Cascade resonator filter: F1 → F2 → F3 ──────────────────
                let y = r3.tick(r2.tick(r1.tick(x)));

                output.push(y * seg_amp);

                // Advance glottal oscillator phase.
                phase = (phase + phase_inc).fract();
            }

            step_start = step_end;
        }

        // Update previous targets for the next segment.
        prev_f1 = to_f1; prev_f2 = to_f2; prev_f3 = to_f3;
        prev_bw1 = to_bw1; prev_bw2 = to_bw2; prev_bw3 = to_bw3;
    }

    // ── Normalise & convert to i16 ───────────────────────────────────────────
    f64_to_i16(&output)
}

/// Envelope gain for sample `i` of an `n`-sample segment with linear ramps of
/// `fade_len` samples at both ends.
///
/// Taking the minimum of the fade-in ramp, the fade-out ramp and 1.0 keeps
/// the envelope continuous even when the segment is shorter than two ramps
/// (it then becomes a triangle); for longer segments it is the usual
/// ramp-up / flat / ramp-down shape.
#[inline]
fn fade_envelope(i: usize, n: usize, fade_len: usize) -> f64 {
    let ramp = fade_len as f64;
    let fade_in = i as f64 / ramp;
    let fade_out = n.saturating_sub(i) as f64 / ramp;
    fade_in.min(fade_out).min(1.0)
}

/// Peak-normalise a floating-point buffer and quantise it to 16-bit PCM.
///
/// The largest absolute sample is located and the whole buffer is rescaled so
/// that this peak lands at 90 % of `i16` full scale (≈ 29 490).  If the peak
/// is at or below `1e-6` the buffer is treated as silence and every output
/// sample is zero.  An empty input yields an empty buffer.
fn f64_to_i16(samples: &[f64]) -> PcmBuffer {
    if samples.is_empty() {
        return Vec::new();
    }

    let peak = samples
        .iter()
        .map(|sample| sample.abs())
        .fold(0.0_f64, f64::max);

    // Target: 90 % of full scale (≈ 29 490).
    let target_peak = 0.90 * 32_767.0;
    let gain = if peak > 1e-6 { target_peak / peak } else { 0.0 };

    let mut pcm = Vec::with_capacity(samples.len());
    for &sample in samples {
        // The clamp is a safety net; after normalisation values already sit
        // inside the i16 range.
        let scaled = (sample * gain).clamp(-32_767.0, 32_767.0);
        pcm.push(scaled as i16);
    }
    pcm
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::synthesize::VoiceParams;

    /// Voice parameters shared by every test.
    fn default_voice() -> VoiceParams { VoiceParams::default() }

    // ── parse_ipa ────────────────────────────────────────────────────────────

    #[test]
    fn parse_simple_vowel() {
        let segs = parse_ipa("ə", &default_voice());
        assert_eq!(segs.len(), 1);
        assert!((segs[0].target.f1 - 500.0).abs() < 1.0);
    }

    #[test]
    fn parse_long_vowel() {
        // "iː" should match as a single long-vowel segment.
        let segs = parse_ipa("iː", &default_voice());
        assert_eq!(segs.len(), 1, "long vowel = one segment");
        assert!(segs[0].dur_samples > ms_to_samples(100.0),
            "long vowel must be longer than 100 ms");
    }

    #[test]
    fn parse_stress_increases_amp() {
        let segs_plain    = parse_ipa("ə",  &default_voice());
        let segs_stressed = parse_ipa("ˈə", &default_voice());
        assert!(segs_stressed[0].amp_factor > segs_plain[0].amp_factor);
    }

    #[test]
    fn parse_word_boundary_inserts_pause() {
        // "biː hiː" → b, iː, [space], h, iː
        let segs = parse_ipa("biː hiː", &default_voice());
        // At least one segment should be silent (space).
        assert!(segs.iter().any(|s| s.target.amp == 0.0),
            "inter-word space must produce a silent segment");
    }

    #[test]
    fn parse_unknown_char_skipped() {
        // Emoji has no formant match — must be skipped without panic.
        let segs = parse_ipa("ə☺ə", &default_voice());
        // Should get two schwa segments, ignoring ☺.
        assert!(segs.len() >= 2);
    }

    // ── resonator_coeffs ────────────────────────────────────────────────────

    #[test]
    fn resonator_unit_dc_gain() {
        // A resonator's DC gain (input = constant 1.0) should converge to 1.0.
        let (a, b, c) = resonator_coeffs(500.0, 100.0, 22050.0);
        // In steady state y = a·x + b·y + c·y, so DC gain = a / (1 − b − c).
        let dc_gain = a / (1.0 - b - c);
        assert!((dc_gain - 1.0).abs() < 1e-6, "dc_gain = {dc_gain}");
    }

    #[test]
    fn resonator_safe_clamp_extreme_freq() {
        // Should not panic for out-of-range frequency values.
        let (a, b, c) = resonator_coeffs(0.0, 0.0, 22050.0);
        assert!(a.is_finite());
        assert!(b.is_finite());
        assert!(c.is_finite());
    }

    // ── synthesize_segments ──────────────────────────────────────────────────

    #[test]
    fn synthesize_empty_gives_empty() {
        let out = synthesize_segments(&[], &default_voice());
        assert!(out.is_empty());
    }

    #[test]
    fn synthesize_produces_correct_length() {
        let segs = parse_ipa("ə", &default_voice());
        let expected = segs[0].dur_samples;
        let out = synthesize_segments(&segs, &default_voice());
        assert_eq!(out.len(), expected);
    }

    #[test]
    fn synthesize_vowel_nonzero() {
        // A voiced vowel must produce non-zero audio.
        let segs = parse_ipa("ə", &default_voice());
        let out = synthesize_segments(&segs, &default_voice());
        let max = out.iter().map(|&s| s.unsigned_abs()).max().unwrap_or(0);
        assert!(max > 1000, "voiced vowel must produce non-trivial audio, got peak {max}");
    }

    #[test]
    fn synthesize_silence_is_zero() {
        use super::Segment;
        let segs = vec![Segment {
            target: SILENCE,
            dur_samples: 1000,
            amp_factor: 1.0,
        }];
        let out = synthesize_segments(&segs, &default_voice());
        assert!(out.iter().all(|&s| s == 0), "silence segment must output zeros");
    }

    #[test]
    fn synthesize_peak_within_i16_range() {
        let segs = parse_ipa("ˈhɛloʊ", &default_voice());
        let out = synthesize_segments(&segs, &default_voice());
        // The synthesizer clamps to ±32767; no sample should be i16::MIN.
        assert!(out.iter().all(|&s| s >= i16::MIN + 1),
            "unexpected i16::MIN in output");
    }

    // ── glottal_sample ───────────────────────────────────────────────────────

    #[test]
    fn glottal_continuity() {
        // The glottal waveform must be continuous: no jump ≥ 0.5 between
        // adjacent phase steps of 1/22050 of a cycle.
        let n = 22050;
        let mut prev = glottal_sample(0.0);
        for i in 1..n {
            let phase = i as f64 / n as f64;
            let cur = glottal_sample(phase);
            let delta = (cur - prev).abs();
            // Large threshold here because the glottal closure can be steep.
            assert!(delta < 0.5, "discontinuity at phase {phase:.4}: Δ = {delta:.4}");
            prev = cur;
        }
    }
}