lac 0.1.0

Lo Audio Codec — lossless audio codec with LPC + partitioned Rice coding.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
//! LPC analysis and synthesis.
//!
//! Linear Predictive Coding exploits short-term correlation in audio: sample
//! `x[n]` is predicted as a weighted sum of its predecessors, and only the
//! small prediction residual is coded. LAC supports any order in `[0, 32]`;
//! the encoder picks whichever minimises the total bitstream size.
//!
//! # Integer widths and Q-formats at a glance
//!
//! Every numeric quantity that crosses a boundary in LAC's codec path is
//! pinned to an exact width and, where fractional, an exact Q-format.
//! This table is the authoritative summary; individual call sites repeat
//! the relevant row in a local comment rather than re-deriving it.
//!
//! | Stage                          | Rust type | Q-format              | Range / constraint                          |
//! |--------------------------------|-----------|-----------------------|---------------------------------------------|
//! | Input samples                  | `i32`     | integer               | `|s| ≤ 2²³ − 1` (spec §1, caller contract)  |
//! | Autocorrelation `R[k]`         | `i64`     | integer               | worst case `N · s² ≈ 2⁶²` at full scale     |
//! | Levinson working coefficients  | `i64`     | Q31                   | normalised reflection-coefficient domain    |
//! | Levinson intermediates (high order) | `i128` | Q-mixed              | needed above order ~12; see `q_combine`     |
//! | Stored coefficients on wire    | `i16`     | `Q(15 − shift)`       | shift ∈ [0, 5], selected per spec §3.4      |
//! | Synthesis accumulator          | `i64`     | `Q(15 − shift)`·scale | ≥ 49 bits required (spec §3.6)              |
//! | Residuals                      | `i32`     | integer               | `|r| ≤ 2 · 2²³` worst case before zigzag    |
//! | Zigzag-encoded residuals       | `u32`     | integer               | `z = (r << 1) ^ (r >> 31)` per spec §4.2    |
//!
//! # Sign convention
//!
//! The wire-format synthesis formula is `predict = +Σ coeff[j] · sample[i-j-1]`.
//! Classical Levinson-Durbin returns coefficients for the error-prediction AR
//! model where `x[n] = −Σ a[j] · x[n-j] + e[n]`; LAC stores
//! `coeff[j-1] = −a[j]` on the wire — the quantisation step negates at the
//! Q31→Q(15−shift) stage.
//!
//! # Rounding
//!
//! `(sum + bias) >> s` with `bias = 1 << (s − 1)` implements round-half
//! toward +∞ via arithmetic right shift — i.e., floor division of the
//! bias-adjusted value by `2^s`. Equivalent to round-to-nearest for
//! positive sums; for negative sums it rounds toward zero on exact
//! half-values, matching spec §3.6's pinned semantics.

use alloc::vec::Vec;

use crate::MAX_LPC_ORDER;

/// Maximum supported `coefficient_shift`. At this value, coefficients span
/// real-value range `[−32, 32)` — far beyond anything audio produces in
/// practice. The bitstream decoder rejects larger values.
///
/// `min_shift_for` clamps its result to this bound, so an ill-conditioned
/// recursion emits saturated coefficients rather than an out-of-spec shift.
pub const MAX_COEFFICIENT_SHIFT: u8 = 5;

/// One predictor order's quantised coefficients together with the
/// `coefficient_shift` they were quantised at. Handed out per order by
/// `LpcLevels::get`.
///
/// `coefficients` borrows directly from the flat `LpcLevels` storage, so
/// the encoder's order-search loop can inspect every candidate order
/// without allocating per iteration.
pub struct LpcOrderView<'a> {
    /// Predictor coefficients in Q(15 − shift); the slice length is the
    /// predictor order itself.
    pub coefficients: &'a [i16],
    /// The quantisation shift. The decoder widens the synthesis
    /// right-shift by exactly this amount to undo it.
    pub shift: u8,
}

/// Contiguous, stack-friendly storage for quantised LPC coefficients at
/// every order `1..=MAX_LPC_ORDER`.
///
/// Order `m` owns row `m − 1`: its coefficients live in
/// `flat[(m-1) * MAX_LPC_ORDER .. (m-1) * MAX_LPC_ORDER + m]`, i.e. the
/// first `m` slots of that row. Roughly half the buffer is unused by
/// this triangular layout, but the payoff is that fetching any order is
/// a plain slice borrow with no allocation — unlike the earlier
/// `Vec<Vec<i16>>`-style `LpcOrderResult` API.
///
/// With `MAX_LPC_ORDER = 32` the coefficient array is
/// `32 × 32 × 2 = 2048` bytes plus 32 bytes of per-order shifts —
/// small enough to live on the stack.
pub struct LpcLevels {
    flat: [i16; (MAX_LPC_ORDER as usize) * (MAX_LPC_ORDER as usize)],
    shifts: [u8; MAX_LPC_ORDER as usize],
    /// Highest order covered by the most recent successful
    /// `lpc_analyze_levels_into` run; 0 while the buffer is untouched.
    max_order: u8,
}

impl LpcLevels {
    /// Build a zeroed buffer. This is a ~2 KB stack zero-fill — still
    /// cheaper than the heap churn a Vec-of-Vec design would incur.
    #[inline]
    pub fn new() -> Self {
        Self {
            flat: [0; (MAX_LPC_ORDER as usize) * (MAX_LPC_ORDER as usize)],
            shifts: [0; MAX_LPC_ORDER as usize],
            max_order: 0,
        }
    }

    /// Borrow the coefficients and shift stored for predictor order `m`.
    /// Debug-asserts that `m` is nonzero and no larger than the
    /// `max_order` covered by the last analysis.
    #[inline]
    pub fn get(&self, m: u8) -> LpcOrderView<'_> {
        debug_assert!(m >= 1 && m <= self.max_order, "order {m} out of range");
        let row = m as usize - 1;
        let start = row * (MAX_LPC_ORDER as usize);
        let end = start + m as usize;
        LpcOrderView {
            coefficients: &self.flat[start..end],
            shift: self.shifts[row],
        }
    }
}

impl Default for LpcLevels {
    fn default() -> Self {
        Self::new()
    }
}

/// Compute the biased autocorrelation `R[0..=order]` of `samples`,
/// writing into the caller's buffer. `out.len()` must be at least
/// `order + 1`; exactly the first `order + 1` entries are written.
///
/// `R[k] = Σ samples[i] × samples[i+k]` for `i = 0 .. N-1-k`. A lag `k`
/// that exceeds the sample count produces an empty sum, i.e. `R[k] = 0`.
///
/// # Overflow analysis
///
/// Samples are s24le: `|sample| ≤ 2^23 − 1 < 2^23`, so each product fits
/// in i48, and a sum of at most `N = 65535` such terms stays below
/// `65535 × 2^46 ≈ 2^62 < 2^63`. i64 therefore suffices at every lag.
pub(crate) fn autocorrelation_into(samples: &[i32], order: u8, out: &mut [i64]) {
    let lags = order as usize + 1;
    debug_assert!(
        out.len() >= lags,
        "autocorrelation_into: out.len()={} too small for order={order}",
        out.len()
    );
    for (k, slot) in out.iter_mut().take(lags).enumerate() {
        // Lag-k dot product of the signal with itself, shifted by k:
        // zipping `samples` against `samples[k..]` covers exactly the
        // i = 0 .. N-1-k range (and is empty when k >= N).
        *slot = samples
            .iter()
            .zip(samples.iter().skip(k))
            .map(|(&x, &y)| x as i64 * y as i64)
            .sum();
    }
}

/// Levinson-Durbin recursion producing Q31 analysis filter coefficients for
/// every order from 1 to `order` in a single pass, quantised into `levels`.
///
/// Entry `m-1` of the flat buffer holds the quantised coefficients for
/// predictor order `m`. All working coefficients are in the analysis-filter
/// convention:
///
/// ```text
/// A(z) = 1 + Σ_{j=1..=m} a[j] · z^{-j}
/// e[n] = x[n] + Σ_{j=1..=m} a[j] · x[n-j]
/// ```
///
/// The bitstream stores predictor coefficients `−a[j]` so the synthesis formula
/// reduces to a plain positive sum; the sign flip is applied by `q31_to_qn`.
///
/// # Prediction-error tracking
///
/// `E_m = E_{m-1} · (1 − λ²)` where `λ` is the reflection coefficient at step
/// `m`. Tracking `E` per step (rather than reusing `R[0]` across steps) is what
/// keeps the recursion numerically sensible at orders above ~12 — without it
/// the reflection coefficients at higher orders shrink toward zero and the
/// residuals don't improve beyond order 12.
///
/// # i128 intermediates
///
/// Three products require widening:
/// 1. `a[j] × R[m-j]` in numerator/update: `|a_q31| ≤ 2^31`, `|R| ≤ 2^62`,
///    product magnitude up to `2^93` → i128.
/// 2. `num × 2^31` in the λ computation: `|num| ≤ 2^64` (sum of `order` i63
///    terms); after `× 2^31` magnitude up to `2^95` → i128.
/// 3. `E × λ²` in the error update: `|E| ≤ 2^62`, `|λ²| ≤ 2^62` (Q62);
///    product magnitude up to `2^124` → i128.
///
/// Returns `false` when `R[0] = 0` (all-zero input — the recursion is
/// undefined) or when the prediction error reaches zero before `order` steps
/// (singular autocorrelation matrix — rare, but possible on fully-predictable
/// synthetic inputs such as pure square waves). In the singular case the rows
/// for orders that never completed are zero-filled before `max_order` is set,
/// so callers always observe a fully-populated buffer.
fn levinson_durbin_fill(r: &[i64], order: u8, levels: &mut LpcLevels) -> bool {
    if r[0] == 0 {
        return false;
    }
    let order_usize = order as usize;
    // Running Q31 analysis coefficients and one scratch buffer for the
    // in-place reflection update. Stack-allocated at
    // `MAX_LPC_ORDER + 1` = 33 entries; no heap traffic per call.
    let mut a = [0i64; (MAX_LPC_ORDER as usize) + 1];
    let mut a_new = [0i64; (MAX_LPC_ORDER as usize) + 1];
    let mut e: i64 = r[0];
    let mut converged = true;
    // Highest order whose coefficients have been emitted into `levels`.
    // Used to bound the zero-fill on early bail-out.
    let mut completed: usize = 0;

    for m in 1..=order_usize {
        // Step 1: numerator of the reflection coefficient.
        //   num = R[m] + Σ_{j=1..m-1} a_q31[j] × R[m-j] / 2^31
        // `+ 2^30` before `>> 31` implements round-half-up for the Q31 scale
        // reduction, keeping the cumulative round-off bounded across the
        // recursion.
        let mut num: i128 = r[m] as i128;
        for j in 1..m {
            let prod = (a[j] as i128 * r[m - j] as i128 + (1i128 << 30)) >> 31;
            num += prod;
        }

        // Step 2: λ = −num × 2^31 / E, rounded to nearest.
        //
        // `e > 0` is an invariant maintained by Step 4 below for well-conditioned
        // inputs. If it fails (singular case), bail out; the post-loop fixup
        // zero-fills the remaining orders so the caller can rely on a
        // fully-populated buffer.
        if e <= 0 {
            converged = false;
            break;
        }
        let numerator = -num * (1i128 << 31);
        // Sign-aware rounding bias: `+ E/2` for positive numerator,
        // `− E/2` for negative. Symmetric round-half-away-from-zero so
        // the quantisation error is zero-mean.
        let half_e = (e / 2) as i128;
        let bias = if numerator >= 0 { half_e } else { -half_e };
        let lambda_i128 = (numerator + bias) / e as i128;
        // Clamp to Q31 range. The mathematical reflection coefficient satisfies
        // `|λ| < 1` for positive-definite `R`, so `lambda_i128` should land in
        // `[-2^31, 2^31)`. Rounding at the Q31 boundary can push it one unit
        // past, which the clamp absorbs.
        let lambda = lambda_i128.clamp(-(1i128 << 31), (1i128 << 31) - 1) as i64;

        // Step 3: reflection update — write the new coefficients into
        // `a_new`, then swap into `a`. Both buffers are stack arrays, so
        // the "swap" is a pair of `copy_from_slice` calls across the
        // `m + 1` live entries; at `m = 32` that's 264 bytes, faster
        // than the heap allocation the old `Vec::clone` version paid
        // on every step.
        a_new[..=m].copy_from_slice(&a[..=m]);
        for j in 1..m {
            let delta = (lambda as i128 * a[m - j] as i128 + (1i128 << 30)) >> 31;
            a_new[j] += delta as i64;
        }
        a_new[m] = lambda;
        a[..=m].copy_from_slice(&a_new[..=m]);

        // Emit this order's coefficients into the flat buffer,
        // quantised to Q(15 − shift) with the minimum shift that avoids
        // clamping.
        let shift = min_shift_for(&a[1..=m]);
        levels.shifts[m - 1] = shift;
        let base = (m - 1) * (MAX_LPC_ORDER as usize);
        for (dst, &coeff) in levels.flat[base..base + m].iter_mut().zip(&a[1..=m]) {
            *dst = q31_to_qn(coeff, shift);
        }
        completed = m;

        // Step 4: update the prediction-error tracker.
        //   E_new = E × (1 − λ² / 2^62)
        // λ is Q31, so λ² is Q62 with magnitude up to 2^62 (representing 1.0).
        // `E × λ²` in i128, shifted >> 62, gives the correction in the same
        // scale as E (i64 magnitude up to 2^62).
        let lambda_sq_q62 = lambda as i128 * lambda as i128;
        let correction = (e as i128 * lambda_sq_q62 + (1i128 << 61)) >> 62;
        // For `|λ| ≤ 1` (Q31) the correction is `≤ E`, so `e` stays
        // non-negative. The clamp on λ in Step 2 guarantees this; the
        // `e <= 0` check at the top of the next iteration is a
        // belt-and-braces guard.
        e -= correction as i64;
    }

    // On early bail (singular R), orders `completed + 1 ..= order` were
    // never emitted. Zero-fill their rows and shifts so callers always
    // see a fully-populated buffer matching `max_order` — otherwise a
    // reused `LpcLevels` could expose stale coefficients from a previous
    // analysis through `LpcLevels::get`. A zeroed row behaves as a
    // degenerate all-zero predictor (prediction 0), which is safe for
    // the encoder's order search.
    if !converged {
        for mm in completed + 1..=order_usize {
            let base = (mm - 1) * (MAX_LPC_ORDER as usize);
            levels.flat[base..base + mm].fill(0);
            levels.shifts[mm - 1] = 0;
        }
    }
    levels.max_order = order;

    converged
}

/// Smallest `coefficient_shift` at which every element of `coeffs_q31`
/// fits in Q(15 − shift) without clamping.
///
/// # Derivation
///
/// A Q15 i16 covers real values `[−1, 1)`; Q(15 − s) stretches that to
/// `[−2^s, 2^s)`. With `real = a_q31 / 2^31` the no-clamp condition
/// `|real| < 2^s` becomes `|a_q31| < 2^(31 + s)`, and the minimal `s`
/// over all elements is:
///
/// ```text
/// s = max(0, floor(log2(max_abs)) − 30)
/// ```
///
/// evaluated here as `64 − leading_zeros(max_abs) − 31`. The result is
/// capped at `MAX_COEFFICIENT_SHIFT` so the bitstream stays in spec; a
/// coefficient needing a larger shift signals an ill-conditioned
/// recursion, and the encoder emits it saturated rather than failing.
fn min_shift_for(coeffs_q31: &[i64]) -> u8 {
    // Peak magnitude across the coefficient set; 0 for an empty slice.
    let peak = coeffs_q31
        .iter()
        .fold(0u64, |acc, &c| acc.max(c.unsigned_abs()));
    if peak < (1u64 << 31) {
        // Everything already fits in plain Q15.
        0
    } else {
        let shift = (64u32 - peak.leading_zeros()).saturating_sub(31) as u8;
        shift.min(MAX_COEFFICIENT_SHIFT)
    }
}

/// Quantise a Q31 Levinson-Durbin analysis coefficient down to the
/// Q(15 − shift) predictor coefficient the bitstream carries.
///
/// # Sign convention
///
/// Levinson-Durbin yields analysis coefficients with error filter
/// `1 + Σ a[j] z^{-j}`, i.e. predictor `x̂[n] = −Σ a[j] x[n-j]`. The wire
/// format instead stores `coeff[j] = −a[j]` so synthesis is the plain
/// positive sum `x̂[n] = +Σ coeff[j] x[n-j]`; that negation happens here.
///
/// # Rounding
///
/// Adding `1 << (15 + shift)` before shifting right by `16 + shift`
/// divides by `2^(16 + shift)` with round-half-up. With `shift = 0` this
/// is the familiar Q15 reduction (`+ 2^15` then `>> 16`). Out-of-range
/// results saturate to the i16 limits.
fn q31_to_qn(a_q31: i64, shift: u8) -> i16 {
    let s = shift as u32;
    // Flip to the wire-format sign first, then scale-reduce.
    let negated = -a_q31;
    let rounded = (negated + (1i64 << (15 + s))) >> (16 + s);
    rounded.clamp(i64::from(i16::MIN), i64::from(i16::MAX)) as i16
}

/// Run LPC analysis into a caller-provided `LpcLevels` buffer, filling
/// quantised predictor coefficients and shifts for all orders
/// `1..=max_order` with a single Levinson-Durbin pass.
///
/// Returns `true` on success. Returns `false` for all-zero input
/// (`R[0] = 0`), in which case the caller must fall back to order 0
/// (verbatim) — `levels` then carries nothing beyond its
/// zero-initialised state.
///
/// # Parameters
///
/// `max_order` must lie in `1 ..= MAX_LPC_ORDER` (debug-asserted);
/// order 0 has no coefficients and is not represented here.
pub fn lpc_analyze_levels_into(samples: &[i32], max_order: u8, levels: &mut LpcLevels) -> bool {
    debug_assert!(
        max_order >= 1,
        "max_order must be ≥ 1 for lpc_analyze_levels_into"
    );
    debug_assert!(
        max_order <= MAX_LPC_ORDER,
        "max_order={max_order} exceeds MAX_LPC_ORDER={MAX_LPC_ORDER}"
    );
    // Autocorrelation scratch on the stack (no heap traffic), sized for
    // the maximum supported order plus the R[0] slot.
    let mut autocorr = [0i64; (MAX_LPC_ORDER as usize) + 1];
    autocorrelation_into(samples, max_order, &mut autocorr);
    let lags = max_order as usize + 1;
    levinson_durbin_fill(&autocorr[..lags], max_order, levels)
}

/// Convenience wrapper: quantised predictor coefficients at exactly
/// `order`, paired with the matching `coefficient_shift`.
///
/// Order 0 (verbatim mode) yields `Some((vec![], 0))` — there are no
/// coefficients and the shift is reported as 0 by convention. All-zero
/// input yields `None`. Otherwise Levinson-Durbin runs up to `order`
/// and the final row is copied out of a temporary `LpcLevels`.
///
/// Allocates one `Vec<i16>` of length `order` for the return; hot paths
/// should instead reuse an `LpcLevels` via `lpc_analyze_levels_into`
/// and read results through `LpcLevels::get`.
#[cfg(test)]
pub(crate) fn lpc_analyze(samples: &[i32], order: u8) -> Option<(Vec<i16>, u8)> {
    if order == 0 {
        return Some((Vec::new(), 0));
    }
    let mut levels = LpcLevels::new();
    lpc_analyze_levels_into(samples, order, &mut levels).then(|| {
        let view = levels.get(order);
        (view.coefficients.to_vec(), view.shift)
    })
}

/// Compute LPC prediction residuals for a frame (allocating wrapper).
///
/// `residual[i] = sample[i] − predict(sample[0..i], coeffs, shift)`.
///
/// # Prediction formula
///
/// With `s = 15 − shift` and Q(15 − shift) coefficients:
///
/// ```text
/// predict[i] = (Σ_{j=0..terms-1} coeffs[j] × sample[i-j-1] + (1 << (s-1))) >> s
/// ```
///
/// where `terms = min(i, order)`. The i64 accumulator has ample
/// headroom: at `shift = 0` each product is bounded by
/// `2^15 × 2^23 = 2^38` (32 terms → `2^43`); at
/// `shift = MAX_COEFFICIENT_SHIFT = 5` each product is bounded by
/// `2^43` (32 terms → `2^48`).
///
/// The `+ (1 << (s-1))` bias rounds the shift to nearest — lower
/// residual variance than truncation; at `shift = 0` it is the classic
/// Q15 `+ 16384` (`= 2^14`).
///
/// # Warm-up period
///
/// Indices `0..order-1` have too few predecessors; `terms = min(i,
/// order)` restricts the sum to what exists, and the very first sample
/// (`i = 0`) is predicted as zero.
///
/// Only reachable from outside the crate through the
/// `__internal-for-bench` feature gate — production callers should use
/// `compute_residuals_into` and skip the allocation.
#[cfg(any(test, feature = "__internal-for-bench"))]
pub fn compute_residuals(samples: &[i32], coeffs: &[i16], shift: u8) -> Vec<i32> {
    let mut residuals = Vec::with_capacity(samples.len());
    compute_residuals_into(samples, coeffs, shift, &mut residuals);
    residuals
}

/// Residual computation into a caller-owned buffer — identical output
/// to the allocating `compute_residuals` wrapper, but lets the frame
/// encoder recycle one `Vec` across every order it evaluates. The
/// per-call allocation this removes showed up in profiling: ~100-200 ns
/// per call over ~15 orders per frame adds up to a few percent of
/// total encode time.
///
/// `out` is cleared on entry, so callers need not do it. Capacity is
/// reserved for `samples.len()` elements; the buffer grows at most once
/// and only the first time a larger frame arrives.
pub fn compute_residuals_into(samples: &[i32], coeffs: &[i16], shift: u8, out: &mut Vec<i32>) {
    out.clear();
    out.reserve(samples.len());

    let order = coeffs.len();
    // Verbatim fast path: with `order = 0` there is no prediction and
    // residuals are the samples themselves. This doubles as the
    // mandatory fallback for all-zero frames (Levinson-Durbin cannot
    // run there) and a cheap short-circuit in the encoder's
    // order-search loop.
    if order == 0 {
        out.extend_from_slice(samples);
        return;
    }

    // One up-front pass reverses and widens the coefficients. Reversal
    // turns the prediction's reversed-access dot product into a
    // forward-forward one; widening i16 → i32 lets LLVM lower the
    // mul-accumulate to an i32×i32→i64 `imul` (or auto-vectorised AVX)
    // instead of a slower i64×i64 path. The MAX_LPC_ORDER-sized stack
    // array keeps the heap out of the per-call picture.
    //
    // Benchmarks showed LLVM's autovectoriser handles this
    // forward-forward dot product as well as — at small orders, better
    // than — a hand-written AVX-512 kernel, whose function-call
    // boundaries cost more than the SIMD width returned. So: keep the
    // loop LLVM-friendly and let the optimiser work.
    let mut rev = [0i32; MAX_LPC_ORDER as usize];
    for (dst, &c) in rev.iter_mut().zip(coeffs.iter().rev()) {
        *dst = c as i32;
    }
    let coeffs_rev = &rev[..order];

    // Q(15 − shift) rescaling constants. The accumulated dot product
    // sits in Q(shift_amt) relative to sample units; `>> shift_amt`
    // brings it back, and `bias = 1 << (shift_amt − 1)` adds a half-LSB
    // for round-to-nearest (round-half-up for non-negative sums,
    // away-from-zero after the shift for negative ones). Both are
    // normative — bit-exact decoding depends on them.
    let shift_amt = 15u32 - shift as u32;
    let bias = 1i64 << (shift_amt - 1);
    let n = samples.len();

    // Warm-up region (`i < order`): the window `samples[..i]` is shorter
    // than the coefficient set, so only the *last* `i` entries of
    // `coeffs_rev` participate. Across the whole frame this is just
    // `order × (order − 1) / 2` iterations — not worth vectorising.
    let warm = order.min(n);
    for i in 0..warm {
        // At `i == 0` there are no predecessors: the prediction is
        // defined as exactly 0, NOT `(0 + bias) >> s` (spec §3.6 pins
        // this; applying the formula would yield `bias >> s` at shifts
        // where those differ and break the round-trip).
        let pred: i64 = if i == 0 {
            0
        } else {
            let tail = &coeffs_rev[order - i..];
            let sum: i64 = tail
                .iter()
                .zip(&samples[..i])
                .map(|(&c, &s)| c as i64 * s as i64)
                .sum();
            (sum + bias) >> shift_amt
        };
        out.push(samples[i] - pred as i32);
    }

    // Steady state (`i >= order`): fixed-length inner loop over two
    // forward slices — the shape LLVM vectorises to an AVX2/AVX-512
    // mul-accumulate in release builds.
    for i in order..n {
        let history = &samples[i - order..i];
        let sum: i64 = coeffs_rev
            .iter()
            .zip(history)
            .map(|(&c, &s)| c as i64 * s as i64)
            .sum();
        let pred = (sum + bias) >> shift_amt;
        out.push(samples[i] - pred as i32);
    }
}

/// Reconstruct samples from LPC residuals and Q(15 − shift) predictor
/// coefficients (allocating wrapper).
///
/// `sample[i] = residual[i] + predict(samples[0..i], coeffs, shift)`.
///
/// Prediction is causal — it consumes already-reconstructed samples —
/// which makes the round-trip exact:
/// `lpc_synthesize(compute_residuals(s, c, k), c, k) == s` for any
/// samples `s`, coefficients `c`, and shift `k`. Residuals travel
/// unquantised, so nothing is lost.
///
/// Formula, rounding, and warm-up handling match `compute_residuals`.
#[cfg(test)]
pub(crate) fn lpc_synthesize(residuals: &[i32], coeffs: &[i16], shift: u8) -> Vec<i32> {
    let mut samples = Vec::with_capacity(residuals.len());
    lpc_synthesize_into(residuals, coeffs, shift, &mut samples);
    samples
}

/// Reconstruct samples from residuals into the caller's buffer. `out`
/// is cleared first and filled with `residuals.len()` reconstructed
/// samples. For hot-path use — the MCU decode loop reuses one buffer
/// across every incoming frame.
///
/// Prediction formula, rounding convention, and warm-up handling mirror
/// `compute_residuals_into` in reverse.
pub fn lpc_synthesize_into(residuals: &[i32], coeffs: &[i16], shift: u8, out: &mut Vec<i32>) {
    out.clear();
    out.reserve(residuals.len());
    let order = coeffs.len();
    // Mirror of the encoder-side Q format: coefficients in
    // Q(15 − shift), accumulator shifted back by `shift_amt` with a
    // half-LSB `bias` for round-to-nearest. These two constants are
    // normative (spec §3.6): they must match the encoder bit-for-bit
    // so reconstructed samples equal the originals exactly.
    //
    // Defensive arithmetic: `shift` may be attacker-influenced at this
    // boundary (the comment on the push below already assumes a
    // malicious bitstream), and the decoder is contractually
    // panic-free on every byte sequence (spec §6). A plain
    // `15 - shift` underflows — panicking in debug builds and
    // producing masked-shift garbage in release — whenever
    // `shift > 15`, so saturate instead. For every valid shift
    // (shift_amt ≥ 1) the result is bit-identical to the encoder.
    let shift_amt = 15u32.saturating_sub(u32::from(shift));
    // Half-LSB rounding bias. At shift_amt == 0 there is no fractional
    // part to round and `1 << (shift_amt - 1)` would itself underflow;
    // the correct bias there is 0 (pred = sum exactly).
    let bias = if shift_amt == 0 {
        0i64
    } else {
        1i64 << (shift_amt - 1)
    };
    let samples = out;
    for (i, &res) in residuals.iter().enumerate() {
        let terms = i.min(order);
        // Warm-up: when `terms == 0` (always true at `i == 0`, and
        // possible for `i < order` on short frames), the sum is empty
        // and prediction is defined as 0 — NOT `(0 + bias) >> s`. The
        // formula is not applied in the warm-up region per spec §3.6;
        // this `if` is what enforces that at the synthesis side and
        // must match the encoder's warm-up handling above.
        let pred = if terms == 0 {
            0i64
        } else {
            // Uses already-reconstructed `samples[0..i]`, not the originals.
            // This is what makes the round-trip lossless: encoder and decoder
            // compute predictions from the same reconstructed history.
            let sum: i64 = (0..terms)
                .map(|j| coeffs[j] as i64 * samples[i - j - 1] as i64)
                .sum();
            (sum + bias) >> shift_amt
        };
        // Wrapping add: `pred as i32` is already a wrapping narrow of
        // the i64 accumulator; the final `res + pred` must match that
        // semantics in release AND debug. On well-formed streams the
        // sum stays within the 24-bit sample range and the wrap is
        // never taken — but a malicious bitstream with attacker-chosen
        // residuals can push this arbitrarily, and the decoder is
        // contractually panic-free on every byte sequence (spec §6).
        // Produced output on overflow is "wrong sample, no panic",
        // matching the spec's "substitute silence / discard frame"
        // recovery model at the caller.
        samples.push(res.wrapping_add(pred as i32));
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;
    use alloc::vec::Vec;

    use crate::test_signals::{angular_step_q32, sin_q15, sine_samples as int_sine_samples};

    /// Build `n` samples of a sinusoid completing `freq_cycles` whole
    /// periods over the block, i.e. angular frequency
    /// `2π · freq_cycles / n` rad/sample. Integer-only; layered on the
    /// shared LUT helpers in `crate::test_signals`.
    fn sine_samples(n: usize, freq_cycles: u64, amplitude: i32) -> Vec<i32> {
        int_sine_samples(n, angular_step_q32(freq_cycles, n as u64), amplitude)
    }

    #[test]
    fn residuals_synthesize_roundtrip_all_orders() {
        let signal = sine_samples(960, 4, 50_000);

        // Order 0 first: no coefficients, so there is no prediction —
        // residuals equal the samples and synthesis is the identity.
        // The shift is ignored at order 0 but still part of the call.
        let (c0, s0) = lpc_analyze(&signal, 0).unwrap();
        assert!(c0.is_empty());
        assert_eq!(s0, 0);
        let r0 = compute_residuals(&signal, &c0, s0);
        assert_eq!(r0, signal);
        assert_eq!(lpc_synthesize(&r0, &c0, s0), signal);

        // Every non-zero order up to MAX_LPC_ORDER must reconstruct
        // the input exactly.
        for order in 1..=MAX_LPC_ORDER {
            let (coeffs, shift) = lpc_analyze(&signal, order).unwrap();
            assert_eq!(coeffs.len(), order as usize);
            assert!(shift <= MAX_COEFFICIENT_SHIFT);
            let res = compute_residuals(&signal, &coeffs, shift);
            assert_eq!(
                lpc_synthesize(&res, &coeffs, shift),
                signal,
                "roundtrip failed at order {order}"
            );
        }
    }

    #[test]
    fn autocorrelation_into_impulse() {
        // An impulse `[c, 0, 0, …, 0]` has a closed-form answer:
        // R[0] = c² and every other lag is exactly zero, straight from
        // the definition `R[k] = Σ s[i] · s[i+k]`. Pinning the raw
        // autocorrelation catches numerical drift that round-trip
        // tests would mask — Levinson-Durbin turns a drifted R into
        // self-consistent but wrong coefficients.
        const C: i32 = 1_000_000; // well inside the 24-bit contract
        let order = 8u8;
        let mut samples = vec![0i32; 32];
        samples[0] = C;
        let mut r = vec![0i64; usize::from(order) + 1];
        autocorrelation_into(&samples, order, &mut r);
        assert_eq!(
            r[0],
            i64::from(C) * i64::from(C),
            "R[0] must equal c² exactly"
        );
        for (k, &lag) in r.iter().enumerate().skip(1) {
            assert_eq!(lag, 0, "R[{k}] must be zero for an impulse input");
        }
    }

    #[test]
    fn autocorrelation_into_dc() {
        // DC input `[c; N]` gives R[k] = (N − k) · c², so a miscounted
        // upper bound in the windowed-sum inner loop shows up as a
        // uniform ±c² offset on every lag.
        const C: i32 = 5_000;
        const N: usize = 64;
        let order = 4u8;
        let dc = vec![C; N];
        let mut r = vec![0i64; usize::from(order) + 1];
        autocorrelation_into(&dc, order, &mut r);
        let c_squared = i64::from(C) * i64::from(C);
        for (k, &lag) in r.iter().enumerate() {
            assert_eq!(lag, (N - k) as i64 * c_squared, "R[{k}] wrong for DC input");
        }
    }

    #[test]
    fn lpc_analyze_levels_matches_lpc_analyze() {
        let signal = sine_samples(960, 6, 100_000);
        let mut levels = LpcLevels::new();
        assert!(lpc_analyze_levels_into(&signal, MAX_LPC_ORDER, &mut levels));
        // Each per-order view must agree with a standalone analysis at
        // that order.
        for order in 1..=MAX_LPC_ORDER {
            let (expected_coeffs, expected_shift) = lpc_analyze(&signal, order).unwrap();
            let view = levels.get(order);
            assert_eq!(
                view.coefficients,
                &expected_coeffs[..],
                "coefficient mismatch at order {order}"
            );
            assert_eq!(view.shift, expected_shift, "shift mismatch at order {order}");
        }
    }

    #[test]
    fn all_zero_frame_returns_none() {
        let silence = vec![0i32; 960];
        assert!(lpc_analyze(&silence, 4).is_none());
        let mut levels = LpcLevels::new();
        assert!(!lpc_analyze_levels_into(&silence, 16, &mut levels));
    }

    #[test]
    fn order_zero_returns_empty_coeffs() {
        let signal = sine_samples(960, 4, 1000);
        let (coeffs, shift) = lpc_analyze(&signal, 0).unwrap();
        assert!(coeffs.is_empty());
        assert_eq!(shift, 0);
    }

    #[test]
    fn high_order_24_stays_non_trivial() {
        // Without the E_m tracker, coefficients at order ≥ ~16 collapse
        // toward zero: `num / R[0]` shrinks every step while R[0] stays
        // fixed. Drive a signal complex enough (three incommensurate
        // sinusoids) that order-8 LPC cannot fully capture it, then
        // check that order-24 coefficients are non-trivial and beat
        // order 8 on residual energy.
        //
        // Q32 steps approximate 0.11, 0.27, 0.43 rad/sample (e.g.
        // 0.11 · 2³² / 2π ≈ 75_143_389). Exact values are irrelevant;
        // the frequencies only need to be mutually unrelated.
        let n = 1024usize;
        let steps: [u32; 3] = [75_143_389, 184_443_047, 293_742_706];
        let amps: [i64; 3] = [40_000, 20_000, 10_000];
        let mut phases = [0u32; 3];
        let mut samples: Vec<i32> = Vec::with_capacity(n);
        for _ in 0..n {
            let mut acc = 0i64;
            for ch in 0..3 {
                // Per-component round-to-nearest in Q15, then sum.
                acc += (sin_q15(phases[ch]) as i64 * amps[ch] + (1 << 14)) >> 15;
                phases[ch] = phases[ch].wrapping_add(steps[ch]);
            }
            samples.push(acc as i32);
        }

        let energy = |coeffs: &[i16], shift: u8| -> u64 {
            compute_residuals(&samples, coeffs, shift)
                .iter()
                .map(|&r| (r as i64 * r as i64) as u64)
                .sum()
        };

        let (c24, s24) = lpc_analyze(&samples, 24).unwrap();
        assert!(
            c24.iter().any(|&c| c != 0),
            "order-24 coefficients collapsed to zero"
        );

        let (c8, s8) = lpc_analyze(&samples, 8).unwrap();
        let e8 = energy(&c8, s8);
        let e24 = energy(&c24, s24);
        assert!(
            e24 < e8,
            "high-order residual energy should be smaller: e8={e8}, e24={e24}"
        );
    }

    #[test]
    fn dc_signal_residuals_are_small() {
        let dc = vec![10_000i32; 960];

        // The biased autocorrelation `R[k] = (N-k)·c²` (not `N·c²`)
        // slightly underestimates the reflection coefficient versus the
        // theoretical optimum, so residuals are small but non-zero.
        let (coeffs, shift) = lpc_analyze(&dc, 4).unwrap();
        let residuals = compute_residuals(&dc, &coeffs, shift);
        let max_residual = residuals[4..].iter().map(|r| r.abs()).max().unwrap_or(0);
        assert!(max_residual <= 20, "DC residuals too large: {max_residual}");
    }

    #[test]
    fn order_32_roundtrip() {
        // 13.5 cycles across 4096 samples, expressed as 27 cycles
        // across 8192 so the integer-only step helper applies directly.
        let samples = int_sine_samples(4096, angular_step_q32(27, 8192), 1_000_000);
        let (coeffs, shift) = lpc_analyze(&samples, 32).unwrap();
        assert_eq!(coeffs.len(), 32);
        let residuals = compute_residuals(&samples, &coeffs, shift);
        assert_eq!(lpc_synthesize(&residuals, &coeffs, shift), samples);
    }

    #[test]
    fn bass_sine_selects_nonzero_shift() {
        // 50 Hz at 48 kHz: step = (50 · 2³²) / 48000 ≈ 4_474_091 per
        // sample, ω ≈ 0.00654 rad/sample, cos ≈ 0.99998, so the optimal
        // `a[1] ≈ −1.9999`. Q15 tops out below 1.0 in magnitude terms
        // here, so representing this needs at least Q14 (shift = 1).
        // Guards `min_shift_for` and the encoder's willingness to widen
        // the Q range instead of clamping.
        let samples = int_sine_samples(4096, angular_step_q32(50, 48_000), 1_000_000);
        let (_coeffs, shift) = lpc_analyze(&samples, 4).unwrap();
        assert!(
            shift >= 1,
            "low-frequency sine should force shift ≥ 1, got {shift}"
        );
    }

    #[test]
    fn min_shift_for_examples() {
        // |c| = 2^31 encodes the real value 1.0 — one past Q15's upper
        // bound of 1 − 2^-15 — so the shift must widen to 1; one below
        // that still fits at shift 0.
        assert_eq!(min_shift_for(&[(1i64 << 31) - 1]), 0);
        assert_eq!(min_shift_for(&[1i64 << 31]), 1);
        assert_eq!(min_shift_for(&[-(1i64 << 31)]), 1);
        assert_eq!(min_shift_for(&[(1i64 << 32) - 1]), 1);
        assert_eq!(min_shift_for(&[1i64 << 32]), 2);
        assert_eq!(min_shift_for(&[0, 0, 0]), 0);
        // Mixed input: the largest magnitude dictates the result.
        assert_eq!(min_shift_for(&[0, 1 << 31, -(1 << 31) + 1]), 1);
    }
}