espeak-ng 0.1.1

Pure Rust port of eSpeak NG text-to-speech
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
//! Harmonic additive waveform synthesizer — port of `wavegen.c`.
//!
//! Implements [`wavegen_segment`] (single inter-frame transition) and
//! [`synthesize_frames`] (full [`SpectSeq`] → raw `i32` samples).
//!
//! [`SpectSeq`]: super::phondata::SpectSeq
//
// ## Algorithm
//
// eSpeak NG uses *additive harmonic synthesis* to generate audio from formant
// data.  The key steps (mirroring `Wavegen()` + `PeaksToHarmspect()` in the C
// source) are:
//
//   1. For each pair of consecutive spectral frames (fr1, fr2) covering `length`
//      samples:
//      a. Compute formant peaks from the frame (ffreq, fheight, fwidth).
//      b. Build a harmonic spectrum table (`htab`) using `PeaksToHarmspect`:
//         for every harmonic h=1…hmax, sum the contributions from all peaks
//         using the peak-shape table.
//      c. Every STEPSIZE (64) samples, re-interpolate fr1→fr2 and rebuild htab.
//      d. For each sample: sum sin_tab[h·waveph >> 5] * htab[h] over all
//         harmonics.  `waveph` is the current oscillator phase (u16, wrapping).
//
//   2. The glottal oscillator: `wavephase` (i32) starts at i32::MAX and
//      increments by `phaseinc` each sample.  When it crosses zero from
//      positive→negative, that is the "quiet point" of the waveform cycle.
//      `waveph = (wavephase >> 16) as u16` gives the phase in [0, 65535].
//
//   3. Unvoiced phonemes use white noise instead of (or mixed with) the
//      harmonic source.
//
// References: wavegen.c (Wavegen, SetSynth, PeaksToHarmspect, WavegenInit)

use super::phondata::SpectFrame;
use super::VoiceParams;

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/// Maximum harmonic count.  C uses 400 but we cap at 200 for speed.
pub const MAX_HARMONIC: usize = 200;

/// Harmonics below this index are interpolated between STEPSIZE steps.
const N_LOWHARM: usize = 30;

/// Coefficient update interval (samples).  Matches espeak-ng's `STEPSIZE`.
pub const STEPSIZE: usize = 64;

/// Phase increment factor.  C: `0x8000000 / samplerate`.
/// For 22050 Hz: 134217728 / 22050 = 6088 (approx).
const PHASE_INC_FACTOR: i32 = 6088;

// ---------------------------------------------------------------------------
// sin_tab — copied verbatim from sintab.h (2048 entries)
// ---------------------------------------------------------------------------

include!("sintab_data.rs");

// ---------------------------------------------------------------------------
// Peak-shape lookup tables
// ---------------------------------------------------------------------------

/// Peak shape table 2 (used by espeak-ng by default).
/// Source: wavegen.c `pk_shape2[257]`.
static PK_SHAPE2: [u8; 257] = [
    255, 254, 254, 254, 254, 254, 254, 254, 254, 254, 253, 253, 253, 253, 252, 252,
    252, 251, 251, 251, 250, 250, 249, 249, 248, 248, 247, 247, 246, 245, 245, 244,
    243, 243, 242, 241, 239, 237, 235, 233, 231, 229, 227, 225, 223, 221, 218, 216,
    213, 211, 208, 205, 203, 200, 197, 194, 191, 187, 184, 181, 178, 174, 171, 167,
    163, 160, 156, 152, 148, 144, 140, 136, 132, 127, 123, 119, 114, 110, 105, 100,
     96,  94,  91,  88,  86,  83,  81,  78,  76,  74,  71,  69,  66,  64,  62,  60,
     57,  55,  53,  51,  49,  47,  44,  42,  40,  38,  36,  34,  32,  30,  29,  27,
     25,  23,  21,  19,  18,  16,  14,  12,  11,   9,   7,   6,   4,   3,   1,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,
];

/// Wavemult window — Hanning window for HF peak synthesis.
/// Preset for 22050 Hz sample rate; 128 entries (N_WAVEMULT=128).
/// Mirrors `wavemult[N_WAVEMULT]` in wavegen.c.
static WAVEMULT: [u8; 128] = [
      0,   0,   0,   2,   3,   5,   8,  11,  14,  18,  22,  27,  32,  37,  43,  49,
     55,  62,  69,  76,  83,  90,  98, 105, 113, 121, 128, 136, 144, 152, 159, 166,
    174, 181, 188, 194, 201, 207, 213, 218, 224, 228, 233, 237, 240, 244, 246, 249,
    251, 252, 253, 253, 253, 253, 252, 251, 249, 246, 244, 240, 237, 233, 228, 224,
    218, 213, 207, 201, 194, 188, 181, 174, 166, 159, 152, 144, 136, 128, 121, 113,
    105,  98,  90,  83,  76,  69,  62,  55,  49,  43,  37,  32,  27,  22,  18,  14,
     11,   8,   5,   3,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
];

/// Compute wavemult_max for 22050 Hz rate.
/// C: wavemult_max = (samplerate * wavemult_fact) / (256 * 50); wavemult_fact = 60.
const WAVEMULT_MAX: usize = (22050 * 60) / (256 * 50); // = 103

/// Wavemult offset (center of window): wavemult_offset = wavemult_max / 2.
const WAVEMULT_OFFSET: i32 = (WAVEMULT_MAX / 2) as i32; // = 51

// ---------------------------------------------------------------------------
// Voice scale factors (default voice)
// ---------------------------------------------------------------------------

/// Default voice frequency scaling per formant (C: `voice.freq[i]`).
/// 256 = unity scaling.
const VOICE_FREQ: [i32; 9] = [256, 256, 256, 256, 256, 256, 256, 256, 256];

/// Default voice height (amplitude) scaling per formant.
/// From voices.c: default_heights = {130,128,120,116,100,100,128,128,128}, ×2 = {260,256,240,232,200,200,256,256,256}
/// HOWEVER: C's actual amplitude formula targets z1 ~ 25000 (no AGC needed).
/// With voice->height[pk]*2 the htab values are too large by ~(256/64)^2=16.
/// Verified empirically by matching C's output RMS. We use voice->height/4.
const VOICE_HEIGHT: [i64; 9] = [65, 64, 60, 58, 50, 50, 64, 64, 64]; // ÷4 of the ×2 values

/// Default voice width scaling per formant.
/// From voices.c: default_widths = {48,40,40,40,40,40,0,0,0}, ×2 = {96,80,80,80,80,80,0,0,0}
const VOICE_WIDTH: [i64; 9] = [96, 80, 80, 80, 80, 80, 80, 80, 80];

/// Number of formant peaks used for harmonic computation (n_harmonic_peaks).
const N_HARMONIC_PEAKS: usize = 5;

// ---------------------------------------------------------------------------
// Peak struct (mirrors wavegen_peaks_t)
// ---------------------------------------------------------------------------

#[derive(Debug, Clone, Copy, Default)]
struct Peak {
    freq:   i64,  // Hz << 16 (fixed point)
    height: i64,  // amplitude (scaled)
    left:   i64,  // left bandwidth
    right:  i64,  // right bandwidth (= left for f3-f5)
}

/// Convert a `SpectFrame` to a peak array using the default voice scaling.
///
/// Mirrors the `SetSynth()` initialisation in wavegen.c.
fn frame_to_peaks(fr: &SpectFrame) -> [Peak; 9] {
    let mut peaks = [Peak::default(); 9];
    // Formants 0-6
    for ix in 0..7 {
        let freq = (fr.ffreq[ix] as i64 * VOICE_FREQ[ix.min(8)] as i64) << 8;
        peaks[ix].freq = freq;
        let height = (fr.fheight[ix] as i64 * VOICE_HEIGHT[ix.min(8)] as i64) << 6;
        peaks[ix].height = height;
        if ix <= 5 {
            let width = (fr.fwidth[ix] as i64 * VOICE_WIDTH[ix.min(8)] as i64) << 10;
            peaks[ix].left = width;
            if ix < 3 {
                let rw = (fr.fright[ix] as i64 * VOICE_WIDTH[ix.min(8)] as i64) << 10;
                peaks[ix].right = rw;
            } else {
                peaks[ix].right = width;
            }
        }
    }
    // Formants 7 and 8: fixed frequencies (as in C's SetSynth hardcodes)
    peaks[7].freq = 7800_i64 << 16;
    peaks[8].freq = 9000_i64 << 16;
    peaks
}

/// Interpolate two peak arrays by factor t ∈ [0, 1].
fn interpolate_peaks(p1: &[Peak; 9], p2: &[Peak; 9], t: f64) -> [Peak; 9] {
    let mut out = [Peak::default(); 9];
    for i in 0..9 {
        out[i].freq   = (p1[i].freq   as f64 + (p2[i].freq   - p1[i].freq)   as f64 * t) as i64;
        out[i].height = (p1[i].height as f64 + (p2[i].height - p1[i].height) as f64 * t) as i64;
        out[i].left   = (p1[i].left   as f64 + (p2[i].left   - p1[i].left)   as f64 * t) as i64;
        out[i].right  = (p1[i].right  as f64 + (p2[i].right  - p1[i].right)  as f64 * t) as i64;
    }
    out
}

// ---------------------------------------------------------------------------
// PeaksToHarmspect — compute harmonic amplitude table
// ---------------------------------------------------------------------------

/// Compute the harmonic amplitude table from formant peaks.
///
/// `pitch_hz16` — fundamental frequency in Hz << 16 (fixed-point).
/// Returns the highest non-zero harmonic index.
///
/// Mirrors `PeaksToHarmspect()` from wavegen.c.
fn peaks_to_harmspect(peaks: &[Peak; 9], pitch_hz16: i64, htab: &mut [i32; MAX_HARMONIC]) -> usize {
    // Zero out the table first
    htab.fill(0);

    if pitch_hz16 <= 0 {
        return 0;
    }

    // Compute hmax: highest harmonic within 95% of Nyquist (22050 Hz)
    let nyquist_limit = (22050_i64 * 19 / 40) << 16; // 95% of Nyquist in Hz<<16
    let hmax_samplerate = (nyquist_limit / pitch_hz16) as usize;
    let hmax = {
        // From the highest peak frequency + its width (N_HARMONIC_PEAKS = wvoice->n_harmonic_peaks)
        let top_peak = &peaks[N_HARMONIC_PEAKS];
        let top_freq = top_peak.freq + top_peak.right;
        let hmax_peak = if pitch_hz16 > 0 { (top_freq / pitch_hz16) as usize } else { 0 };
        hmax_peak.min(hmax_samplerate).min(MAX_HARMONIC - 1)
    };

    // ── Phase 1: Sum formant peak contributions (WITHOUT >>15 — applied later) ──
    // Mirrors: htab[h++] += pk_shape[...] * p->height;  (C does NOT shift here)
    for pk in 0..=N_HARMONIC_PEAKS {
        let p = &peaks[pk];
        if p.height == 0 || p.freq == 0 {
            continue;
        }
        let fp = p.freq;     // centre freq in Hz<<16
        let fhi = fp + p.right;

        // Starting harmonic at the left edge of this peak
        let h_start = if p.left > 0 {
            let h = ((fp - p.left) / pitch_hz16) as usize;
            h.max(1)
        } else {
            ((fp / pitch_hz16) as usize).max(1)
        };

        let mut h = h_start;
        let mut f = pitch_hz16 * h as i64;

        // Rising slope (below peak centre)
        // C: htab[h++] += pk_shape[...] * p->height;  (direct, no >>15 here)
        // max: 255 * 204800 = 52M < i32::MAX (2.1G); saturating_add for safety
        while f < fp && h < MAX_HARMONIC {
            let diff = fp - f;
            let bw = p.left >> 8;
            if bw > 0 {
                let idx = ((diff / bw) as usize).min(256);
                let contrib = (PK_SHAPE2[idx] as i64 * p.height).min(i32::MAX as i64) as i32;
                htab[h] = htab[h].saturating_add(contrib);
            }
            h += 1;
            f += pitch_hz16;
        }
        // Falling slope (above peak centre)
        while f < fhi && h < MAX_HARMONIC {
            let diff = f - fp;
            let bw = p.right >> 8;
            if bw > 0 {
                let idx = ((diff / bw) as usize).min(256);
                let contrib = (PK_SHAPE2[idx] as i64 * p.height).min(i32::MAX as i64) as i32;
                htab[h] = htab[h].saturating_add(contrib);
            }
            h += 1;
            f += pitch_hz16;
        }
    }

    // ── Phase 2: Add bass boost (BEFORE squaring — mirrors C order) ──
    // C: y = peaks[1].height * 10; h2 = (1000<<16)/pitch; x = y/h2; while(y>0){htab[h++]+=y; y-=x;}
    let bass_h = (1000_i64 << 16) / pitch_hz16; // harmonics below 1000 Hz
    if bass_h > 0 {
        // peaks[1].height is in our scaled units (fheight * 4096); convert to C units
        // C: peaks[1].height = fheight * voice->height << 6 = fheight * 64 * 64 = fheight * 4096
        // Our frame_to_peaks stores peaks[1].height = fheight * 64 * 64 (same)
        // C bass: y = peaks[1].height * 10; but after PeaksToHarmspect squaring the units differ.
        // To match: bass_add ≈ peaks[1].height_after_squaring * 10, where height_after_squaring
        // = (htab_contrib >> 15)^2 >> 8 ≈ same order as peak htab entries.
        // Simpler: replicate C exactly: y = peaks[1].height * 10 (pre-squaring units)
        // C: y = peaks[1].height * 10; x = y/h2; while(y>0){htab[h++]+=y; y-=x;}
        let bass_y0 = peaks[1].height * 10;
        let bass_step = (bass_y0 / bass_h).max(1);
        let mut y = bass_y0;
        let mut h = 1usize;
        while y > 0 && h < MAX_HARMONIC {
            htab[h] = htab[h].saturating_add(y.min(i32::MAX as i64) as i32);
            y -= bass_step;
            h += 1;
        }
    }

    // ── Phase 3: Square-root conversion — x = htab[h]>>15; htab[h] = (x*x)>>8 ──
    let mut effective_hmax = 0;
    for h in 1..=hmax {
        let x = htab[h] >> 15;
        htab[h] = (x * x) >> 8;
        if htab[h] > 0 {
            effective_hmax = h;
        }
    }

    // ── Phase 4: First harmonic adjustment — option_harmonic1 = 10 ──
    // C: h1 = htab[1] * option_harmonic1; htab[1] = h1/8;
    if hmax >= 1 {
        htab[1] = htab[1] * 10 / 8;
        if htab[1] > 0 { effective_hmax = effective_hmax.max(1); }
    }

    effective_hmax.max(1)
}

// ---------------------------------------------------------------------------
// Xorshift32 PRNG (for unvoiced noise)
// ---------------------------------------------------------------------------

struct Xorshift32(u32);
impl Xorshift32 {
    fn new() -> Self { Xorshift32(0xBAD_5EED) }
    #[inline]
    fn next_i32(&mut self) -> i32 {
        self.0 ^= self.0 << 13;
        self.0 ^= self.0 >> 17;
        self.0 ^= self.0 << 5;
        self.0 as i32
    }
}

// ---------------------------------------------------------------------------
// Segment — a synthesis unit between two frames
// ---------------------------------------------------------------------------

/// A synthesis segment: the harmonic-additive transition from `fr1` to `fr2`.
///
/// Corresponds to one inter-frame interval in a [`SpectSeq`].
///
/// [`SpectSeq`]: super::phondata::SpectSeq
#[derive(Debug, Clone)]
pub struct SynthSegment {
    /// Starting formant frame.
    pub fr1: SpectFrame,
    /// Ending formant frame (linearly interpolated towards over `n_samples`).
    pub fr2: SpectFrame,
    /// Number of samples to generate for this segment.
    pub n_samples: usize,
    /// True if this is a voiced segment (glottal source active).
    pub voiced: bool,
    /// Noise fraction mixed in (0.0 = pure tone, 1.0 = pure noise).
    pub noise_frac: f64,
    /// Amplitude multiplier relative to the voice default (1.0 = normal).
    pub amp_factor: f64,
}

// ---------------------------------------------------------------------------
// wavegen_segment — synthesize one segment
// ---------------------------------------------------------------------------

/// Synthesize one segment (transition between two spectral frames) to samples.
///
/// Mirrors the inner loop of `Wavegen()` in wavegen.c.
///
/// # Arguments
/// * `seg`     — segment to synthesize
/// * `f0_hz`  — fundamental frequency (pitch) in Hz
/// * `global_amp` — overall amplitude scale (0.0–1.0)
/// * `wavephase` — carry-over oscillator phase (i32, mutable)
///
/// The `wavephase` state is threaded through calls so that the oscillator
/// continues smoothly across frame boundaries.
pub fn wavegen_segment(
    seg: &SynthSegment,
    f0_hz: f64,
    global_amp: f64,
    wavephase: &mut i32,
) -> Vec<i32> {
    let n = seg.n_samples;
    if n == 0 {
        return Vec::new();
    }

    let mut output = vec![0i32; n];

    let pitch_i32: i32 = ((f0_hz as i64 * 4096) as i32).max(25 * 4096);
    // phaseinc = (pitch >> 7) * PHASE_INC_FACTOR  (from C: pitch is Hz<<12)
    let phaseinc: i32 = ((pitch_i32 >> 7) as i64 * PHASE_INC_FACTOR as i64) as i32;
    let pitch_hz16: i64 = (f0_hz as i64) << 16;

    let peaks1 = frame_to_peaks(&seg.fr1);
    let peaks2 = frame_to_peaks(&seg.fr2);

    // Amplitude: mirrors C formula:
    //   wdata.amplitude = stress_amp * general_amplitude / 16   (e.g. 75 for primary stress)
    //   amplitude2 = wdata.amplitude * (pitch>>8) * amplitude_fmt / 80000
    // We receive amp_factor = wdata_amplitude * amplitude_fmt (= ~7500 for primary, normalized to 1.0)
    // so amp_scale = global_amp * 7500 * amp_factor
    // (7500 = 75 * 100 for primary stress at default volume)
    let amp_scale: i64 = (global_amp * 7500.0 * seg.amp_factor) as i64;

    // harmonic amplitude tables (two copies, we interpolate between them)
    let mut htab = [[0i32; MAX_HARMONIC]; 2];
    let mut harm_inc = [0i32; N_LOWHARM];
    let mut hswitch = 0usize;
    let mut maxh = 0usize;
    let mut maxh2; // computed each STEPSIZE block

    let mut rng = Xorshift32::new();
    let mut amplitude2: i64 = 0;

    // ── HF peak state ─────────────────────────────────────────────────────
    // Harmonics for peaks above N_HARMONIC_PEAKS (F6, F7, F8) are windowed
    // by the wavemult Hanning window centered on the glottal pulse.
    // C: cbytes = wavemult_offset - cycle_samples/2 at start of each cycle.
    // C: hf_factor = wdata.pitch >> 11 = f0_hz*4096 >> 11 = f0_hz*2
    const SAMPLERATE: f64 = 22050.0;
    let cycle_samples = (SAMPLERATE / f0_hz) as i32;
    let hf_factor = ((f0_hz as i64 * 4096) >> 11).max(1); // = f0_hz * 2
    // peak_harmonic[pk-6] = nearest harmonic to F(6+pk)
    // peak_height[pk-6] = amplitude^2 for HF peak
    const N_HF: usize = 3; // peaks 6,7,8
    let mut peak_harmonic = [0u32; N_HF];
    let mut peak_height = [0i64; N_HF];
    let hmax_sr = ((22050 * 19 / 40) as f64 / f0_hz) as u32; // Nyquist limit

    // Initialize HF peaks from interpolated frame at t=0
    {
        let t = 0.0f64;
        let interp = interpolate_peaks(&frame_to_peaks(&seg.fr1), &frame_to_peaks(&seg.fr2), t);
        for (k, fk) in (N_HARMONIC_PEAKS + 1..9).enumerate() {
            let freq_hz16 = interp[fk].freq;
            let pitch = (f0_hz as i64) * 4096; // wdata.pitch = Hz<<12
            // C: peak_harmonic = (peaks.freq / (pitch*8) + 1) / 2
            // where peaks.freq is Hz<<16 and pitch is Hz<<12
            let h = if pitch > 0 { ((freq_hz16 / (pitch * 8) + 1) / 2).max(0) as u32 } else { 0 };
            peak_harmonic[k] = if h <= hmax_sr { h } else { 0 };
            // C: x = peaks.height >> 14; peak_height = (x*x*5)/2
            let x = (interp[fk].height >> 14) as i64;
            peak_height[k] = if peak_harmonic[k] > 0 { (x * x * 5) / 2 } else { 0 };
        }
    }
    let mut cbytes: i32 = WAVEMULT_OFFSET - cycle_samples / 2; // starts negative

    for i in 0..n {
        // ── Every 64 samples: update harmonic table ──────────────────────
        if (i & 63) == 0 {
            let t = i as f64 / n as f64;
            let interp_peaks = interpolate_peaks(&peaks1, &peaks2, t);

            if i == 0 {
                hswitch = 0;
                let hm = peaks_to_harmspect(&interp_peaks, pitch_hz16, &mut htab[0]);
                hswitch ^= 1;
                maxh = hm;

                // adjust amplitude for pitch (fewer harmonics at high pitch)
                amplitude2 = (amp_scale * ((pitch_i32 >> 8) as i64)) / (10000 * 8);
            } else {
                maxh2 = peaks_to_harmspect(&interp_peaks, pitch_hz16, &mut htab[hswitch]);

                // Compute interpolation increments for low harmonics
                let other = hswitch ^ 1;
                let n_lo = N_LOWHARM.min(maxh2).min(maxh);
                for h in 1..n_lo {
                    harm_inc[h] = (htab[hswitch][h] - htab[other][h]) >> 3;
                }

                hswitch ^= 1;
                maxh = maxh2;
            }
        } else if (i & 7) == 0 {
            // Every 8 samples: interpolate low harmonics
            for h in 1..N_LOWHARM.min(maxh) {
                htab[hswitch][h] = htab[hswitch][h].saturating_add(harm_inc[h]);
            }
        }

        // ── Advance oscillator ────────────────────────────────────────────
        let old_phase = *wavephase;
        *wavephase = wavephase.wrapping_add(phaseinc);

        // Detect zero crossing (quiet point of waveform cycle) — new glottal pulse
        if old_phase > 0 && *wavephase < 0 {
            // Recompute amplitude every cycle
            amplitude2 = (amp_scale * ((pitch_i32 >> 8) as i64)) / (10000 * 8);
            // Reset cbytes to start of window (C: cbytes = wavemult_offset - cycle_samples/2)
            cbytes = WAVEMULT_OFFSET - cycle_samples / 2;
        }

        // ── Generate sample ───────────────────────────────────────────────
        let waveph = (*wavephase >> 16) as u16;

        let mut total: i64 = 0;

        if seg.voiced {
            // ── Step 1: HF peaks (windowed by wavemult) ──────────────────
            // C applies HF contributions FIRST, then multiplies by window.
            // This creates the glottal pulse shape.
            cbytes += 1;
            if cbytes >= 0 && (cbytes as usize) < WAVEMULT_MAX {
                let mut hf_total: i64 = 0;
                for k in 0..N_HF {
                    let h = peak_harmonic[k];
                    if h > 0 && peak_height[k] > 0 {
                        let theta = (waveph as u32).wrapping_mul(h) as u16;
                        let idx = (theta >> 5) as usize;
                        hf_total += SIN_TAB[idx] as i64 * peak_height[k];
                    }
                }
                // C: total = (long)(total / hf_factor) * wavemult[cbytes]
                // hf_total is the HF contribution; multiply by window
                total = (hf_total / hf_factor) * WAVEMULT[cbytes as usize] as i64;
            } else {
                total = 0;
            }

            // ── Step 2: Main peaks (F0-F5), sign-switched above ~900 Hz ─
            let mut theta = waveph;
            let mh = maxh.min(MAX_HARMONIC - 1);
            let h_switch_sign = (890_u32 / (f0_hz as u32).max(1)) as usize;
            let h_switch_sign = h_switch_sign.min(mh);

            for h in 1..=h_switch_sign {
                let idx = (theta >> 5) as usize;
                total += SIN_TAB[idx] as i64 * htab[hswitch][h] as i64;
                theta = theta.wrapping_add(waveph);
            }
            for h in (h_switch_sign + 1)..=mh {
                let idx = (theta >> 5) as usize;
                total -= SIN_TAB[idx] as i64 * htab[hswitch][h] as i64;
                theta = theta.wrapping_add(waveph);
            }
        }

        if seg.noise_frac > 0.0 {
            let noise_amp = (seg.noise_frac * amplitude2 as f64) as i64;
            total += rng.next_i32() as i64 * noise_amp / (1 << 20);
        }

        // Scale by amplitude
        let z = (total >> 8) * amplitude2 / (1 << 13);
        output[i] = z.clamp(i32::MIN as i64 + 1, i32::MAX as i64) as i32;
    }

    output
}

// ---------------------------------------------------------------------------
// synthesize_frames — synthesize a full spectral sequence
// ---------------------------------------------------------------------------

/// Synthesize a full `SpectSeq` (all frames) into raw samples.
///
/// Frames are synthesized in pairs: frame[i] → frame[i+1].
/// The last frame is held for one extra segment (repeated).
///
/// Returns unnormalized i32 samples.
pub fn synthesize_frames(
    seq: &super::phondata::SpectSeq,
    voice: &VoiceParams,
    amp_factor: f64,
    wavephase: &mut i32,
) -> Vec<i32> {
    if seq.frames.is_empty() {
        return Vec::new();
    }

    let speed_factor = 100.0 / voice.speed_percent.max(1) as f64;
    let f0_hz = voice.pitch_hz.max(25) as f64;
    let global_amp = voice.amplitude.clamp(0, 100) as f64 / 100.0;

    let mut output = Vec::new();

    let n = seq.frames.len();
    for i in 0..n {
        let fr1 = &seq.frames[i];
        let fr2 = if i + 1 < n { &seq.frames[i + 1] } else { fr1 };

        let n_samples = fr1.dur_samples(speed_factor);

        // Determine voiced/noise character from the frame
        let av = fr1.klattp[0]; // AV (voicing amplitude)
        let fric = if seq.is_klatt { fr1.klattp2[1] } else { 0 }; // Fric

        let (voiced, noise_frac) = if av > 0 && fr1.ffreq[1] > 0 {
            (true, fric as f64 / 255.0)
        } else if fr1.ffreq[1] == 0 && av == 0 {
            (false, 0.5) // fricative/stop: mostly noise
        } else {
            (av > 0, fric as f64 / 255.0)
        };

        let seg = SynthSegment {
            fr1: fr1.clone(),
            fr2: fr2.clone(),
            n_samples,
            voiced,
            noise_frac,
            amp_factor,
        };

        let samples = wavegen_segment(&seg, f0_hz, global_amp, wavephase);
        output.extend_from_slice(&samples);
    }

    output
}

// ---------------------------------------------------------------------------
// f32_to_i16 normalisation
// ---------------------------------------------------------------------------

/// Normalise an i32 sample buffer and clip to i16.
pub fn i32_to_i16(samples: &[i32]) -> Vec<i16> {
    if samples.is_empty() {
        return Vec::new();
    }
    let peak = samples.iter().map(|&x| x.unsigned_abs()).max().unwrap_or(0);
    // Target: 90% of i16 full scale
    let target = (0.90 * 32767.0) as i64;
    let scale: i64 = if peak > 0 { target * 4096 / peak as i64 } else { 0 };

    samples.iter()
        .map(|&x| ((x as i64 * scale) >> 12).clamp(-32767, 32767) as i16)
        .collect()
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    fn voiced_frame(f1: i16, f2: i16, f3: i16, len: u8) -> SpectFrame {
        let mut f = SpectFrame::default();
        f.ffreq[1] = f1;
        f.ffreq[2] = f2;
        f.ffreq[3] = f3;
        f.fwidth[0] = 25;  // F0 width
        f.fwidth[1] = 50;  // F1 width
        f.fwidth[2] = 80;  // F2 width
        f.fright[0] = 25;
        f.fright[1] = 50;
        f.fright[2] = 80;
        f.fheight = [100, 60, 40, 20, 10, 5, 2, 0];
        f.rms = 50;
        f.klattp[0] = 50; // AV = 50 (voiced)
        f.length = len;
        f
    }

    fn default_voice() -> VoiceParams { VoiceParams::default() }

    #[test]
    fn peaks_to_harmspect_nonzero() {
        let frame = voiced_frame(500, 1500, 2500, 4);
        let peaks = frame_to_peaks(&frame);
        let mut htab = [0i32; MAX_HARMONIC];
        let hmax = peaks_to_harmspect(&peaks, 118_i64 << 16, &mut htab);
        // Should produce non-zero harmonics
        assert!(hmax > 0);
        let total: i32 = htab[1..].iter().sum();
        assert!(total > 0, "harmonic table should be non-zero");
    }

    #[test]
    fn wavegen_segment_produces_audio() {
        let fr = voiced_frame(500, 1500, 2500, 4);
        let seg = SynthSegment {
            fr1: fr.clone(),
            fr2: fr.clone(),
            n_samples: 512,
            voiced: true,
            noise_frac: 0.0,
            amp_factor: 1.0,
        };
        let mut phase = i32::MAX;
        let samples = wavegen_segment(&seg, 118.0, 0.8, &mut phase);
        assert_eq!(samples.len(), 512);
        let max = samples.iter().map(|x| x.unsigned_abs()).max().unwrap_or(0);
        assert!(max > 0, "voiced segment must produce non-zero audio");
    }

    #[test]
    fn wavegen_segment_silence_when_zero_peaks() {
        let fr = SpectFrame::default(); // all zeros
        let seg = SynthSegment {
            fr1: fr.clone(),
            fr2: fr.clone(),
            n_samples: 256,
            voiced: false,
            noise_frac: 0.0,
            amp_factor: 1.0,
        };
        let mut phase = i32::MAX;
        let samples = wavegen_segment(&seg, 118.0, 0.8, &mut phase);
        assert_eq!(samples.len(), 256);
        // All-zero frame with no voicing and no noise → silence
        let max = samples.iter().map(|x| x.unsigned_abs()).max().unwrap_or(0);
        assert_eq!(max, 0, "zero frame + no voice → silence");
    }

    #[test]
    fn synthesize_frames_uses_all_frames() {
        let seq = super::super::phondata::SpectSeq {
            frames: vec![
                voiced_frame(500, 1500, 2500, 2),
                voiced_frame(510, 1480, 2480, 2),
                voiced_frame(520, 1460, 2460, 2),
            ],
            is_klatt: false,
        };
        let mut phase = i32::MAX;
        let samples = synthesize_frames(&seq, &default_voice(), 1.0, &mut phase);
        // 3 frames × 2 steps × 64 = 384 samples
        let expected = 3 * 2 * 64;
        assert_eq!(samples.len(), expected,
            "expected {expected} samples, got {}", samples.len());
    }

    #[test]
    fn i32_to_i16_normalises() {
        let big = vec![100_000i32, -200_000i32, 150_000i32];
        let out = i32_to_i16(&big);
        assert_eq!(out.len(), 3);
        // Peak should be close to 90% of 32767
        let peak = out.iter().map(|&x| x.unsigned_abs()).max().unwrap();
        assert!(peak > 28000, "should normalise close to full scale: peak={peak}");
    }

    #[test]
    fn i32_to_i16_empty() {
        assert!(i32_to_i16(&[]).is_empty());
    }

    #[test]
    fn phase_carries_over() {
        // Phase must evolve smoothly across two calls
        let fr = voiced_frame(500, 1500, 2500, 1);
        let seg = SynthSegment {
            fr1: fr.clone(),
            fr2: fr.clone(),
            n_samples: 64,
            voiced: true,
            noise_frac: 0.0,
            amp_factor: 1.0,
        };
        let mut phase = i32::MAX;
        let _ = wavegen_segment(&seg, 118.0, 0.8, &mut phase);
        let phase_after_first = phase;
        let _ = wavegen_segment(&seg, 118.0, 0.8, &mut phase);
        // Phase should have advanced (not reset)
        assert_ne!(phase, phase_after_first,
            "phase should evolve between calls");
    }
}