use crate::flac::constants::{MAX_FIXED_ORDER, MAX_LPC_ORDER};
use crate::flac::error::FlacError;
#[cfg(target_arch = "x86_64")]
#[allow(unsafe_op_in_unsafe_fn)]
mod avx2_lpc {
use std::arch::x86_64::*;
#[target_feature(enable = "avx2")]
#[inline]
pub unsafe fn hsum4_i64(v: __m256i) -> i64 {
let lo = _mm256_castsi256_si128(v);
let hi = _mm256_extracti128_si256::<1>(v);
let s = _mm_add_epi64(lo, hi);
let hi64 = _mm_unpackhi_epi64(s, s);
_mm_cvtsi128_si64(_mm_add_epi64(s, hi64))
}
#[target_feature(enable = "avx2")]
#[inline]
pub unsafe fn dot4(c: *const i32, s_ptr: *const i32) -> i64 {
let c_vec = _mm_loadu_si128(c as *const __m128i);
let s_raw = _mm_loadu_si128(s_ptr as *const __m128i);
let s_rev = _mm_shuffle_epi32(s_raw, 0x1B); let prod = _mm256_mul_epi32(
_mm256_cvtepi32_epi64(c_vec),
_mm256_cvtepi32_epi64(s_rev),
);
hsum4_i64(prod)
}
#[target_feature(enable = "avx2")]
#[inline]
pub unsafe fn dot8(c: *const i32, s_ptr: *const i32) -> i64 {
let c_lo = _mm_loadu_si128(c as *const __m128i);
let c_hi = _mm_loadu_si128(c.add(4) as *const __m128i);
let s_lo_rev = _mm_shuffle_epi32(_mm_loadu_si128(s_ptr.add(4) as *const __m128i), 0x1B);
let s_hi_rev = _mm_shuffle_epi32(_mm_loadu_si128(s_ptr as *const __m128i), 0x1B);
let sum = _mm256_add_epi64(
_mm256_mul_epi32(_mm256_cvtepi32_epi64(c_lo), _mm256_cvtepi32_epi64(s_lo_rev)),
_mm256_mul_epi32(_mm256_cvtepi32_epi64(c_hi), _mm256_cvtepi32_epi64(s_hi_rev)),
);
hsum4_i64(sum)
}
#[target_feature(enable = "avx2")]
#[inline]
pub unsafe fn dot12(c: *const i32, s_ptr: *const i32) -> i64 {
let c0 = _mm_loadu_si128(c as *const __m128i);
let c1 = _mm_loadu_si128(c.add(4) as *const __m128i);
let c2 = _mm_loadu_si128(c.add(8) as *const __m128i);
let s0 = _mm_shuffle_epi32(_mm_loadu_si128(s_ptr.add(8) as *const __m128i), 0x1B);
let s1 = _mm_shuffle_epi32(_mm_loadu_si128(s_ptr.add(4) as *const __m128i), 0x1B);
let s2 = _mm_shuffle_epi32(_mm_loadu_si128(s_ptr as *const __m128i), 0x1B);
let sum = _mm256_add_epi64(
_mm256_add_epi64(
_mm256_mul_epi32(_mm256_cvtepi32_epi64(c0), _mm256_cvtepi32_epi64(s0)),
_mm256_mul_epi32(_mm256_cvtepi32_epi64(c1), _mm256_cvtepi32_epi64(s1)),
),
_mm256_mul_epi32(_mm256_cvtepi32_epi64(c2), _mm256_cvtepi32_epi64(s2)),
);
hsum4_i64(sum)
}
#[target_feature(enable = "avx2")]
#[inline]
pub unsafe fn dot_order<const ORDER: usize>(c: *const i32, s: *const i32, si: usize) -> i64 {
match ORDER {
1 => *c as i64 * *s.add(si-1) as i64,
2 => *c as i64 * *s.add(si-1) as i64
+ *c.add(1) as i64 * *s.add(si-2) as i64,
3 => *c as i64 * *s.add(si-1) as i64
+ *c.add(1) as i64 * *s.add(si-2) as i64
+ *c.add(2) as i64 * *s.add(si-3) as i64,
4 => dot4(c, s.add(si-4)),
5 => dot4(c, s.add(si-4))
+ *c.add(4) as i64 * *s.add(si-5) as i64,
6 => dot4(c, s.add(si-4))
+ *c.add(4) as i64 * *s.add(si-5) as i64
+ *c.add(5) as i64 * *s.add(si-6) as i64,
7 => dot4(c, s.add(si-4))
+ *c.add(4) as i64 * *s.add(si-5) as i64
+ *c.add(5) as i64 * *s.add(si-6) as i64
+ *c.add(6) as i64 * *s.add(si-7) as i64,
8 => dot8(c, s.add(si-8)),
9 => dot8(c, s.add(si-8))
+ *c.add(8) as i64 * *s.add(si-9) as i64,
10 => dot8(c, s.add(si-8))
+ *c.add(8) as i64 * *s.add(si-9) as i64
+ *c.add(9) as i64 * *s.add(si-10) as i64,
11 => dot8(c, s.add(si-8))
+ *c.add(8) as i64 * *s.add(si-9) as i64
+ *c.add(9) as i64 * *s.add(si-10) as i64
+ *c.add(10) as i64 * *s.add(si-11) as i64,
12 => dot12(c, s.add(si-12)),
_ => {
let mut acc = 0i64;
for j in 0..ORDER { acc += *c.add(j) as i64 * *s.add(si-1-j) as i64; }
acc
}
}
}
#[target_feature(enable = "avx2")]
pub unsafe fn lpc_residuals4<const ORDER: usize>(
s: *const i32, c: *const i32, shift: i32, out: *mut i32, n: usize,
) {
let ushift = shift as u64;
let fill_bits: u64 = if ushift == 0 { 0 } else { !0u64 << (64 - ushift) };
let shift_v = _mm256_set1_epi64x(ushift as i64);
let fill_v = _mm256_set1_epi64x(fill_bits as i64);
let zero = _mm256_setzero_si256();
let narrow_perm = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
let mut c_i32 = [0i32; ORDER];
for (j, item) in c_i32.iter_mut().enumerate().take(ORDER) {
*item = *c.add(j);
}
let n4 = (n / 4) * 4;
let mut k = 0usize;
while k < n4 {
let base = ORDER + k;
let mut acc = _mm256_setzero_si256();
for (j, &cj) in c_i32.iter().enumerate().take(ORDER) {
let cj = _mm256_set1_epi64x(cj as i64);
let s4 = _mm_loadu_si128(s.add(base - 1 - j) as *const __m128i);
let s4e = _mm256_cvtepi32_epi64(s4);
acc = _mm256_add_epi64(acc, _mm256_mul_epi32(cj, s4e));
}
let lshifted = _mm256_srlv_epi64(acc, shift_v);
let sign_mask = _mm256_cmpgt_epi64(zero, acc); let pred = _mm256_or_si256(lshifted, _mm256_and_si256(sign_mask, fill_v));
let actual4 = _mm_loadu_si128(s.add(base) as *const __m128i);
let actual4e = _mm256_cvtepi32_epi64(actual4);
let res64 = _mm256_sub_epi64(actual4e, pred);
let narrow = _mm256_permutevar8x32_epi32(res64, narrow_perm);
_mm_storeu_si128(out.add(k) as *mut __m128i, _mm256_castsi256_si128(narrow));
k += 4;
}
while k < n {
let i = ORDER + k;
let pred = dot_order::<ORDER>(c, s, i) >> shift;
*out.add(k) = (*s.add(i) as i64 - pred) as i32;
k += 1;
}
}
}
#[inline(always)]
unsafe fn restore_sample<const ORDER: usize>(
buf: *const i32, cv: &[i32; ORDER], si: usize,
) -> i64 {
let mut sum = 0i64;
for (j, item) in cv.iter().enumerate().take(ORDER) {
sum += *item as i64 * unsafe { *buf.add(si - 1 - j) } as i64;
}
sum
}
#[inline(always)]
unsafe fn lpc_restore_fast<const ORDER: usize>(
buf: *mut i32, res: *const i32, n: usize, c: *const i32, shift: i32,
) {
let mut cv = [0i32; ORDER];
for (j, item) in cv.iter_mut().enumerate().take(ORDER) {
unsafe { *item = *c.add(j); }
}
for k in 0..n {
let si = ORDER + k;
let pred = unsafe { restore_sample::<ORDER>(buf, &cv, si) } >> shift;
let residual = unsafe { *res.add(k) } as i64;
unsafe { *buf.add(si) = (pred + residual) as i32; }
}
}
pub fn fixed_predictor_residual(samples: &[i32], order: usize) -> Result<Vec<i32>, FlacError> {
if order > MAX_FIXED_ORDER {
return Err(FlacError::InvalidFixedOrder { order: order as u8 });
}
if samples.len() < order {
return Err(FlacError::InvalidBlockSize { size: samples.len() as u32 });
}
let n = samples.len();
let out_len = n - order;
let mut residuals = Vec::with_capacity(out_len);
let ptr: *mut i32 = residuals.as_mut_ptr();
match order {
0 => {
residuals.extend_from_slice(samples);
return Ok(residuals);
}
1 => {
for i in 0..out_len {
unsafe { ptr.add(i).write(samples[i + 1].wrapping_sub(samples[i])); }
}
}
2 => {
for i in 0..out_len {
let r = samples[i + 2]
.wrapping_sub(samples[i + 1].wrapping_shl(1))
.wrapping_add(samples[i]);
unsafe { ptr.add(i).write(r); }
}
}
3 => {
for i in 0..out_len {
let r = samples[i + 3]
.wrapping_sub(samples[i + 2].wrapping_mul(3))
.wrapping_add(samples[i + 1].wrapping_mul(3))
.wrapping_sub(samples[i]);
unsafe { ptr.add(i).write(r); }
}
}
4 => {
for i in 0..out_len {
let r = samples[i + 4]
.wrapping_sub(samples[i + 3].wrapping_shl(2))
.wrapping_add(samples[i + 2].wrapping_mul(6))
.wrapping_sub(samples[i + 1].wrapping_shl(2))
.wrapping_add(samples[i]);
unsafe { ptr.add(i).write(r); }
}
}
_ => unreachable!(),
}
unsafe { residuals.set_len(out_len); }
Ok(residuals)
}
pub fn fixed_predictor_restore(
warmup: &[i32],
residuals: &[i32],
order: usize,
) -> Result<Vec<i32>, FlacError> {
if order > MAX_FIXED_ORDER {
return Err(FlacError::InvalidFixedOrder { order: order as u8 });
}
if warmup.len() < order {
return Err(FlacError::InvalidBlockSize {
size: warmup.len() as u32,
});
}
let mut samples = Vec::with_capacity(order + residuals.len());
samples.extend_from_slice(&warmup[..order]);
match order {
0 => {
for &r in residuals {
samples.push(r);
}
}
1 => {
for &r in residuals {
let i = samples.len();
let p = samples[i - 1] as i64;
samples.push((p + r as i64) as i32);
}
}
2 => {
for &r in residuals {
let i = samples.len();
let p = 2 * samples[i - 1] as i64 - samples[i - 2] as i64;
samples.push((p + r as i64) as i32);
}
}
3 => {
for &r in residuals {
let i = samples.len();
let p = 3 * samples[i - 1] as i64
- 3 * samples[i - 2] as i64
+ samples[i - 3] as i64;
samples.push((p + r as i64) as i32);
}
}
4 => {
for &r in residuals {
let i = samples.len();
let p = 4 * samples[i - 1] as i64
- 6 * samples[i - 2] as i64
+ 4 * samples[i - 3] as i64
- samples[i - 4] as i64;
samples.push((p + r as i64) as i32);
}
}
_ => unreachable!("fixed predictor order > 4 should have been caught above"),
}
Ok(samples)
}
pub(crate) fn fixed_predictor_restore_into(
buf: &mut [i32],
offset: usize,
order: usize,
block_size: usize,
) -> Result<(), FlacError> {
if order > MAX_FIXED_ORDER {
return Err(FlacError::InvalidFixedOrder { order: order as u8 });
}
let end = offset + block_size;
match order {
0 => {}
1 => {
let mut s = buf[offset];
for r in buf.iter_mut().take(end).skip(offset + 1) {
let ns = s.wrapping_add(*r);
*r = ns;
s = ns;
}
}
2 => {
let mut s0 = buf[offset];
let mut s1 = buf[offset + 1];
for r in buf.iter_mut().take(end).skip(offset + 2) {
let s2 = s1.wrapping_shl(1).wrapping_sub(s0).wrapping_add(*r);
*r = s2;
s0 = s1;
s1 = s2;
}
}
3 => {
let mut s0 = buf[offset];
let mut s1 = buf[offset + 1];
let mut s2 = buf[offset + 2];
for r in buf.iter_mut().take(end).skip(offset + 3) {
let s3 = s2.wrapping_mul(3)
.wrapping_sub(s1.wrapping_mul(3))
.wrapping_add(s0)
.wrapping_add(*r);
*r = s3;
s0 = s1;
s1 = s2;
s2 = s3;
}
}
4 => {
let mut s0 = buf[offset];
let mut s1 = buf[offset + 1];
let mut s2 = buf[offset + 2];
let mut s3 = buf[offset + 3];
for r in buf.iter_mut().take(end).skip(offset + 4) {
let s4 = s3.wrapping_shl(2)
.wrapping_sub(s2.wrapping_mul(6))
.wrapping_add(s1.wrapping_shl(2))
.wrapping_sub(s0)
.wrapping_add(*r);
*r = s4;
s0 = s1; s1 = s2; s2 = s3; s3 = s4;
}
}
_ => unreachable!(),
}
Ok(())
}
pub(crate) fn lpc_predictor_restore_into(
buf: &mut Vec<i32>,
offset: usize,
order: usize,
qlp_coeffs: &[i32],
qlp_shift: i8,
) -> Result<(), FlacError> {
if order == 0 || order > MAX_LPC_ORDER {
return Err(FlacError::InvalidLpcOrder { order: order as u8 });
}
if qlp_shift < 0 {
return Err(FlacError::InvalidLpcShift { shift: qlp_shift });
}
let n = buf.len() - offset - order;
let shift = qlp_shift as i32;
let c = qlp_coeffs.as_ptr();
let p = unsafe { buf.as_mut_ptr().add(offset) };
macro_rules! dispatch_restore_into {
($N:literal) => { unsafe {
lpc_restore_fast_into::<$N>(p, n, c, shift);
}};
}
match order {
1 => dispatch_restore_into!(1), 2 => dispatch_restore_into!(2),
3 => dispatch_restore_into!(3), 4 => dispatch_restore_into!(4),
5 => dispatch_restore_into!(5), 6 => dispatch_restore_into!(6),
7 => dispatch_restore_into!(7), 8 => dispatch_restore_into!(8),
9 => dispatch_restore_into!(9), 10 => dispatch_restore_into!(10),
11 => dispatch_restore_into!(11), 12 => dispatch_restore_into!(12),
_ => {
for k in 0..n {
let si = offset + order + k;
let residual = buf[si] as i64;
let mut pred: i64 = 0;
for (j, &coeff) in qlp_coeffs.iter().enumerate() {
pred += coeff as i64 * buf[si - 1 - j] as i64;
}
buf[si] = ((pred >> shift) + residual) as i32;
}
}
}
Ok(())
}
unsafe fn lpc_restore_fast_into<const ORDER: usize>(
buf: *mut i32, n: usize, c: *const i32, shift: i32,
) {
let mut cv = [0i32; ORDER];
for (j, item) in cv.iter_mut().enumerate().take(ORDER) {
unsafe { *item = *c.add(j); }
}
for k in 0..n {
let si = ORDER + k;
let residual = unsafe { *buf.add(si) } as i64;
let pred = unsafe { restore_sample::<ORDER>(buf, &cv, si) } >> shift;
unsafe { *buf.add(si) = (pred + residual) as i32; }
}
}
pub fn find_best_fixed_order(samples: &[i32]) -> usize {
let n = samples.len();
if n <= 4 {
return 0;
}
let max_order = MAX_FIXED_ORDER.min(n - 1);
let mut e = [0u64; 5];
e[0] = samples[0].unsigned_abs() as u64;
let r1_i1 = samples[1].wrapping_sub(samples[0]);
e[0] += samples[1].unsigned_abs() as u64;
e[1] += r1_i1.unsigned_abs() as u64;
let r1_i2 = samples[2].wrapping_sub(samples[1]);
let r2_i2 = r1_i2.wrapping_sub(r1_i1);
e[0] += samples[2].unsigned_abs() as u64;
e[1] += r1_i2.unsigned_abs() as u64;
e[2] += r2_i2.unsigned_abs() as u64;
let r1_i3 = samples[3].wrapping_sub(samples[2]);
let r2_i3 = r1_i3.wrapping_sub(r1_i2);
let r3_i3 = r2_i3.wrapping_sub(r2_i2);
e[0] += samples[3].unsigned_abs() as u64;
e[1] += r1_i3.unsigned_abs() as u64;
e[2] += r2_i3.unsigned_abs() as u64;
e[3] += r3_i3.unsigned_abs() as u64;
let mut prev = samples[3];
let mut d1 = r1_i3;
let mut d2 = r2_i3;
let mut d3 = r3_i3;
for cur in samples.iter().take(n).skip(4) {
let r1 = cur.wrapping_sub(prev);
let r2 = r1.wrapping_sub(d1);
let r3 = r2.wrapping_sub(d2);
let r4 = r3.wrapping_sub(d3);
e[0] += cur.unsigned_abs() as u64;
e[1] += r1.unsigned_abs() as u64;
e[2] += r2.unsigned_abs() as u64;
e[3] += r3.unsigned_abs() as u64;
e[4] += r4.unsigned_abs() as u64;
prev = *cur;
d1 = r1;
d2 = r2;
d3 = r3;
}
e[..=max_order]
.iter()
.enumerate()
.min_by_key(|&(_, &v)| v)
.map(|(i, _)| i)
.unwrap_or(0)
}
pub fn autocorrelation(samples: &[i32], max_order: usize) -> Vec<f64> {
#[cfg(target_arch = "x86_64")]
{
autocorrelation_simd(samples, max_order)
}
#[cfg(not(target_arch = "x86_64"))]
{
autocorrelation_scalar(samples, max_order)
}
}
#[cfg(target_arch = "x86_64")]
#[allow(unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "avx2,fma")]
unsafe fn autocorrelation_avx2_fma(samples: &[i32], max_order: usize) -> Vec<f64> {
use std::arch::x86_64::*;
let n = samples.len();
let n_lags = max_order + 1;
let mut r = vec![0.0f64; n_lags];
if n == 0 { return r; }
let s = samples.as_ptr();
let full_groups = n_lags / 4;
for g in 0..full_groups {
let l0 = g * 4;
let l3 = l0 + 3;
for i in l0..l3.min(n) {
let si = *s.add(i) as f64;
r[l0] += si * *s.add(i - l0) as f64;
if i > l0 { r[l0 + 1] += si * *s.add(i - l0 - 1) as f64; }
if i >= l0 + 2 { r[l0 + 2] += si * *s.add(i - l0 - 2) as f64; }
}
if n <= l3 { continue; }
let mut a0 = _mm256_setzero_pd();
let mut a1 = _mm256_setzero_pd();
let mut a2 = _mm256_setzero_pd();
let mut a3 = _mm256_setzero_pd();
let n4 = l3 + ((n - l3) / 4) * 4;
let mut i = l3;
while i < n4 {
let p = s.add(i - l3); a0 = _mm256_fmadd_pd(
_mm256_set1_pd(*s.add(i) as f64),
_mm256_cvtepi32_pd(_mm_loadu_si128(p as *const _)),
a0,
);
a1 = _mm256_fmadd_pd(
_mm256_set1_pd(*s.add(i + 1) as f64),
_mm256_cvtepi32_pd(_mm_loadu_si128(p.add(1) as *const _)),
a1,
);
a2 = _mm256_fmadd_pd(
_mm256_set1_pd(*s.add(i + 2) as f64),
_mm256_cvtepi32_pd(_mm_loadu_si128(p.add(2) as *const _)),
a2,
);
a3 = _mm256_fmadd_pd(
_mm256_set1_pd(*s.add(i + 3) as f64),
_mm256_cvtepi32_pd(_mm_loadu_si128(p.add(3) as *const _)),
a3,
);
i += 4;
}
let mut acc = _mm256_add_pd(_mm256_add_pd(a0, a1), _mm256_add_pd(a2, a3));
while i < n {
let si_f = _mm256_set1_pd(*s.add(i) as f64);
let s4f = _mm256_cvtepi32_pd(_mm_loadu_si128(s.add(i - l3) as *const _));
acc = _mm256_fmadd_pd(si_f, s4f, acc);
i += 1;
}
let mut tmp = [0.0f64; 4];
_mm256_storeu_pd(tmp.as_mut_ptr(), acc);
r[l0] += tmp[3]; r[l0 + 1] += tmp[2]; r[l0 + 2] += tmp[1]; r[l0 + 3] += tmp[0]; }
let base = full_groups * 4;
for (lag, r) in r.iter_mut().enumerate().take(n_lags).skip(base) {
let mut a0 = 0.0f64;
let mut a1 = 0.0f64;
let mut a2 = 0.0f64;
let mut a3 = 0.0f64;
let n4 = lag + ((n - lag) / 4) * 4;
let mut i = lag;
while i < n4 {
a0 += *s.add(i) as f64 * *s.add(i - lag) as f64;
a1 += *s.add(i + 1) as f64 * *s.add(i + 1 - lag) as f64;
a2 += *s.add(i + 2) as f64 * *s.add(i + 2 - lag) as f64;
a3 += *s.add(i + 3) as f64 * *s.add(i + 3 - lag) as f64;
i += 4;
}
let mut sum = (a0 + a1) + (a2 + a3);
while i < n {
sum += *s.add(i) as f64 * *s.add(i - lag) as f64;
i += 1;
}
*r = sum;
}
r
}
#[cfg(target_arch = "x86_64")]
#[allow(unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "avx2")]
unsafe fn autocorrelation_avx2(samples: &[i32], max_order: usize) -> Vec<f64> {
use std::arch::x86_64::*;
let n = samples.len();
let n_lags = max_order + 1;
let mut r = vec![0.0f64; n_lags];
if n == 0 { return r; }
let s = samples.as_ptr();
let full_groups = n_lags / 4;
for g in 0..full_groups {
let l0 = g * 4;
let l3 = l0 + 3;
for i in l0..l3.min(n) {
let si = *s.add(i) as f64;
r[l0] += si * *s.add(i - l0) as f64;
if i > l0 { r[l0 + 1] += si * *s.add(i - l0 - 1) as f64; }
if i >= l0 + 2 { r[l0 + 2] += si * *s.add(i - l0 - 2) as f64; }
}
let mut acc = _mm256_setzero_pd();
for i in l3..n {
let si_f = _mm256_set1_pd(*s.add(i) as f64);
let s4i = _mm_loadu_si128(s.add(i - l3) as *const _);
let s4f = _mm256_cvtepi32_pd(s4i);
acc = _mm256_add_pd(acc, _mm256_mul_pd(si_f, s4f));
}
let mut tmp = [0.0f64; 4];
_mm256_storeu_pd(tmp.as_mut_ptr(), acc);
r[l0] += tmp[3];
r[l0 + 1] += tmp[2];
r[l0 + 2] += tmp[1];
r[l0 + 3] += tmp[0];
}
let base = full_groups * 4;
for (lag, r) in r.iter_mut().enumerate().take(n_lags).skip(base) {
let mut sum = 0.0f64;
for i in lag..n {
sum += *s.add(i) as f64 * *s.add(i - lag) as f64;
}
*r = sum;
}
r
}
#[cfg(target_arch = "x86_64")]
fn autocorrelation_simd(samples: &[i32], max_order: usize) -> Vec<f64> {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
return unsafe { autocorrelation_avx2_fma(samples, max_order) };
}
if is_x86_feature_detected!("avx2") {
return unsafe { autocorrelation_avx2(samples, max_order) };
}
}
autocorrelation_scalar(samples, max_order)
}
fn autocorrelation_scalar(samples: &[i32], max_order: usize) -> Vec<f64> {
let n = samples.len();
let mut r = vec![0.0f64; max_order + 1];
for lag in 0..=max_order {
let mut sum = 0.0f64;
for i in lag..n {
sum += samples[i] as f64 * samples[i - lag] as f64;
}
r[lag] = sum;
}
r
}
pub fn levinson_durbin(r: &[f64], order: usize) -> Result<(Vec<f64>, f64), FlacError> {
if r.is_empty() || r[0] <= 0.0 {
return Err(FlacError::LpcCoefficientOverflow);
}
let order = order.min(r.len() - 1);
let mut a = vec![0.0f64; order];
let mut a_prev = vec![0.0f64; order];
let mut error = r[0];
for i in 0..order {
let mut sum = r[i + 1];
for j in 0..i {
sum -= a_prev[j] * r[i - j];
}
if error.abs() < 1e-10 {
break;
}
let k = sum / error;
a[i] = k;
for j in 0..i {
a[j] = a_prev[j] - k * a_prev[i - 1 - j];
}
error *= 1.0 - k * k;
a_prev[..=i].copy_from_slice(&a[..=i]);
}
Ok((a, error))
}
pub fn compute_lpc_coefficients(samples: &[i32], order: usize) -> Result<Vec<f64>, FlacError> {
if order == 0 || order > MAX_LPC_ORDER {
return Err(FlacError::InvalidLpcOrder { order: order as u8 });
}
if samples.len() <= order {
return Err(FlacError::InvalidBlockSize {
size: samples.len() as u32,
});
}
let r = autocorrelation(samples, order);
let (coeffs, _error) = levinson_durbin(&r, order)?;
Ok(coeffs)
}
pub fn quantize_lpc_coefficients(
coeffs: &[f64],
precision: u8,
) -> Result<(Vec<i32>, i8), FlacError> {
if coeffs.is_empty() {
return Ok((vec![], 0));
}
let max_coeff = coeffs.iter().map(|c| c.abs()).fold(0.0f64, f64::max);
if max_coeff < 1e-10 {
return Ok((vec![0i32; coeffs.len()], 0));
}
let log2_max = max_coeff.log2();
let shift = (precision as i32 - 1) - log2_max.ceil() as i32;
let shift = shift.clamp(-16, 15) as i8;
let scale = 2.0f64.powi(shift as i32);
let max_val = (1i64 << (precision - 1)) - 1;
let min_val = -(1i64 << (precision - 1));
let mut quantized = Vec::with_capacity(coeffs.len());
for &c in coeffs {
let q = (c * scale).round() as i64;
let clamped = q.clamp(min_val, max_val) as i32;
quantized.push(clamped);
}
Ok((quantized, shift))
}
pub fn lpc_predictor_residual(
samples: &[i32],
qlp_coeffs: &[i32],
qlp_shift: i8,
) -> Result<Vec<i32>, FlacError> {
let order = qlp_coeffs.len();
if order == 0 || order > MAX_LPC_ORDER {
return Err(FlacError::InvalidLpcOrder { order: order as u8 });
}
if samples.len() < order {
return Err(FlacError::InvalidBlockSize {
size: samples.len() as u32,
});
}
if qlp_shift < 0 {
return Err(FlacError::InvalidLpcShift { shift: qlp_shift });
}
let n_out = samples.len() - order;
let mut residuals = Vec::with_capacity(n_out);
#[cfg(target_arch = "x86_64")]
if is_x86_feature_detected!("avx2") {
use avx2_lpc::lpc_residuals4;
let s = samples.as_ptr();
let out: *mut i32 = residuals.as_mut_ptr();
let shift = qlp_shift as i32;
let c = qlp_coeffs.as_ptr();
macro_rules! dispatch {
($N:literal) => { unsafe {
lpc_residuals4::<$N>(s, c, shift, out, n_out);
residuals.set_len(n_out);
}};
}
match order {
1 => dispatch!(1), 2 => dispatch!(2), 3 => dispatch!(3),
4 => dispatch!(4), 5 => dispatch!(5), 6 => dispatch!(6),
7 => dispatch!(7), 8 => dispatch!(8), 9 => dispatch!(9),
10 => dispatch!(10), 11 => dispatch!(11), 12 => dispatch!(12),
_ => {
for i in order..samples.len() {
let mut pred: i64 = 0;
for (j, &c) in qlp_coeffs.iter().enumerate() {
pred += c as i64 * samples[i-1-j] as i64;
}
pred >>= qlp_shift;
residuals.push((samples[i] as i64 - pred) as i32);
}
}
}
return Ok(residuals);
}
macro_rules! lpc_residuals_unrolled {
($N:literal) => {{
let c = &qlp_coeffs[..$N];
for i in $N..samples.len() {
let mut pred: i64 = 0;
for j in 0..$N { pred += c[j] as i64 * samples[i - 1 - j] as i64; }
pred >>= qlp_shift;
residuals.push((samples[i] as i64 - pred) as i32);
}
}};
}
match order {
1 => lpc_residuals_unrolled!(1),
2 => lpc_residuals_unrolled!(2),
3 => lpc_residuals_unrolled!(3),
4 => lpc_residuals_unrolled!(4),
5 => lpc_residuals_unrolled!(5),
6 => lpc_residuals_unrolled!(6),
7 => lpc_residuals_unrolled!(7),
8 => lpc_residuals_unrolled!(8),
12 => lpc_residuals_unrolled!(12),
_ => {
for i in order..samples.len() {
let mut pred: i64 = 0;
for (j, &c) in qlp_coeffs.iter().enumerate() {
pred += c as i64 * samples[i - 1 - j] as i64;
}
pred >>= qlp_shift;
residuals.push((samples[i] as i64 - pred) as i32);
}
}
}
Ok(residuals)
}
pub fn lpc_predictor_restore(
warmup: &[i32],
residuals: &[i32],
qlp_coeffs: &[i32],
qlp_shift: i8,
) -> Result<Vec<i32>, FlacError> {
let order = qlp_coeffs.len();
if order == 0 || order > MAX_LPC_ORDER {
return Err(FlacError::InvalidLpcOrder { order: order as u8 });
}
if warmup.len() < order {
return Err(FlacError::InvalidBlockSize {
size: warmup.len() as u32,
});
}
if qlp_shift < 0 {
return Err(FlacError::InvalidLpcShift { shift: qlp_shift });
}
let total = order + residuals.len();
let mut samples = Vec::with_capacity(total);
samples.extend_from_slice(&warmup[..order]);
let shift = qlp_shift as i32;
let c = qlp_coeffs.as_ptr();
samples.reserve(residuals.len());
let buf = samples.as_mut_ptr();
let res_ptr = residuals.as_ptr();
let n = residuals.len();
macro_rules! dispatch_restore {
($N:literal) => { unsafe {
lpc_restore_fast::<$N>(buf, res_ptr, n, c, shift);
samples.set_len(total);
}};
}
match order {
1 => dispatch_restore!(1), 2 => dispatch_restore!(2),
3 => dispatch_restore!(3), 4 => dispatch_restore!(4),
5 => dispatch_restore!(5), 6 => dispatch_restore!(6),
7 => dispatch_restore!(7), 8 => dispatch_restore!(8),
9 => dispatch_restore!(9), 10 => dispatch_restore!(10),
11 => dispatch_restore!(11), 12 => dispatch_restore!(12),
_ => {
for &residual in residuals {
let i = samples.len();
let mut pred: i64 = 0;
for (j, &c) in qlp_coeffs.iter().enumerate() {
pred += c as i64 * samples[i-1-j] as i64;
}
pred >>= qlp_shift;
samples.push((pred + residual as i64) as i32);
}
}
}
Ok(samples)
}
struct LdAllCoeffs {
data: [[f64; MAX_LPC_ORDER]; MAX_LPC_ORDER],
errors: [f64; MAX_LPC_ORDER],
n: usize,
}
fn levinson_durbin_all(r: &[f64], max_order: usize) -> Result<LdAllCoeffs, FlacError> {
if r.is_empty() || r[0] <= 0.0 {
return Err(FlacError::LpcCoefficientOverflow);
}
let order = max_order.min(r.len() - 1);
let mut ld = LdAllCoeffs {
data: [[0.0f64; MAX_LPC_ORDER]; MAX_LPC_ORDER],
errors: [0.0f64; MAX_LPC_ORDER],
n: order,
};
let mut a = [0.0f64; MAX_LPC_ORDER];
let mut a_prev = [0.0f64; MAX_LPC_ORDER];
let mut error = r[0];
for i in 0..order {
let mut sum = r[i + 1];
for j in 0..i {
sum -= a_prev[j] * r[i - j];
}
if error.abs() < 1e-10 {
for o in i..order {
ld.data[o][..=i].copy_from_slice(&a[..=i]);
ld.errors[o] = 0.0;
}
return Ok(ld);
}
let k = sum / error;
a[i] = k;
for j in 0..i {
a[j] = a_prev[j] - k * a_prev[i - 1 - j];
}
error *= 1.0 - k * k;
a_prev[..=i].copy_from_slice(&a[..=i]);
ld.data[i][..=i].copy_from_slice(&a[..=i]);
ld.errors[i] = error;
}
Ok(ld)
}
pub fn find_best_lpc_order(
samples: &[i32],
max_order: usize,
qlp_precision: u8,
exhaustive: bool,
) -> Result<(usize, Vec<i32>, i8), FlacError> {
let max_order = max_order.min(MAX_LPC_ORDER).min(samples.len() - 1);
if max_order == 0 {
return Ok((0, vec![], 0));
}
let r = autocorrelation(samples, max_order);
let ld = levinson_durbin_all(&r, max_order)?;
let mut best_order = 1;
let mut best_coeffs = vec![];
let mut best_shift = 0i8;
let mut best_cost = f64::MAX;
let orders: &[usize] = if exhaustive {
let mut arr = [0usize; MAX_LPC_ORDER];
for (i, item) in arr.iter_mut().enumerate().take(max_order) { *item = i + 1; }
return find_best_from_orders(qlp_precision, &r, &ld, &arr[..max_order]);
} else {
match max_order {
0 => return Ok((0, vec![], 0)),
1 => &[1][..],
2 => &[1, 2][..],
3 => &[1, 2, 3][..],
4 => &[1, 2, 4][..],
5 => &[1, 2, 4, 5][..],
6 => &[1, 2, 4, 6][..],
7 => &[1, 2, 4, 6, 7][..],
8 => &[1, 2, 4, 6, 8][..],
9 => &[1, 2, 4, 6, 8, 9][..],
10 => &[1, 2, 4, 6, 8, 10][..],
11 => &[1, 2, 4, 6, 8, 10, 11][..],
_ => &[1, 2, 4, 6, 8, 10, 12][..], }
};
for &order in orders {
if order > ld.n { continue; }
let coeffs = &ld.data[order - 1][..order];
let (qlp_coeffs, qlp_shift) = quantize_lpc_coefficients(coeffs, qlp_precision)?;
let cost = lpc_order_cost(&r, coeffs, &qlp_coeffs, qlp_shift, ld.errors[order - 1], qlp_precision, order);
if cost < best_cost {
best_cost = cost;
best_order = order;
best_coeffs = qlp_coeffs;
best_shift = qlp_shift;
}
}
Ok((best_order, best_coeffs, best_shift))
}
#[inline]
fn lpc_order_cost(
r: &[f64],
float_coeffs: &[f64],
qlp_coeffs: &[i32],
qlp_shift: i8,
burg_error: f64,
qlp_precision: u8,
order: usize,
) -> f64 {
let scale = 2.0f64.powi(-(qlp_shift as i32));
let quant_noise: f64 = float_coeffs.iter().zip(qlp_coeffs.iter()).enumerate()
.map(|(j, (&fc, &qc))| {
let delta = fc - qc as f64 * scale;
delta * delta * r.get(j + 1).copied().unwrap_or(0.0)
})
.sum();
let coeff_penalty = (order * qlp_precision as usize) as f64 * r[0] * 1e-6;
burg_error + quant_noise + coeff_penalty
}
fn find_best_from_orders(
qlp_precision: u8,
r: &[f64],
ld: &LdAllCoeffs,
orders: &[usize],
) -> Result<(usize, Vec<i32>, i8), FlacError> {
let mut best_order = orders.first().copied().unwrap_or(1);
let mut best_coeffs = vec![];
let mut best_shift = 0i8;
let mut best_cost = f64::MAX;
for &order in orders {
if order > ld.n { continue; }
let coeffs = &ld.data[order - 1][..order];
let (qlp_coeffs, qlp_shift) = quantize_lpc_coefficients(coeffs, qlp_precision)?;
let cost = lpc_order_cost(r, coeffs, &qlp_coeffs, qlp_shift, ld.errors[order - 1], qlp_precision, order);
if cost < best_cost {
best_cost = cost;
best_order = order;
best_coeffs = qlp_coeffs;
best_shift = qlp_shift;
}
}
Ok((best_order, best_coeffs, best_shift))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_fixed_predictor_order_0() {
let samples = vec![10, 20, 30, 40, 50];
let residuals = fixed_predictor_residual(&samples, 0).unwrap();
assert_eq!(residuals, samples);
}
#[test]
fn test_fixed_predictor_order_1() {
let samples = vec![10, 20, 30, 40, 50];
let residuals = fixed_predictor_residual(&samples, 1).unwrap();
assert_eq!(residuals, vec![10, 10, 10, 10]); }
#[test]
fn test_fixed_predictor_order_2() {
let samples = vec![10, 15, 20, 25, 30];
let residuals = fixed_predictor_residual(&samples, 2).unwrap();
assert_eq!(residuals, vec![0, 0, 0]); }
#[test]
fn test_fixed_predictor_roundtrip() {
let original = vec![100, 150, 180, 200, 250, 280, 320];
for order in 0..=4 {
let residuals = fixed_predictor_residual(&original, order).unwrap();
let restored = fixed_predictor_restore(&original[..order], &residuals, order).unwrap();
assert_eq!(restored, original, "Roundtrip failed for order {}", order);
}
}
#[test]
fn test_autocorrelation() {
let samples = vec![1, 2, 3, 2, 1];
let r = autocorrelation(&samples, 2);
assert!((r[0] - 19.0).abs() < 1e-10); }
#[test]
fn test_levinson_durbin_basic() {
let r = vec![1.0, 0.5, 0.25];
let (coeffs, _error) = levinson_durbin(&r, 2).unwrap();
assert_eq!(coeffs.len(), 2);
}
#[test]
fn test_quantize_coefficients() {
let coeffs = vec![0.5, -0.25, 0.125];
let (quantized, _shift) = quantize_lpc_coefficients(&coeffs, 12).unwrap();
assert_eq!(quantized.len(), 3);
for q in &quantized {
assert!(q.abs() < (1 << 11)); }
}
#[test]
fn test_lpc_predictor_roundtrip() {
let samples: Vec<i32> = (0..100)
.map(|i| (100.0 * (i as f64 * 0.1).sin()) as i32)
.collect();
let order = 4;
let coeffs = compute_lpc_coefficients(&samples, order).unwrap();
let (qlp_coeffs, qlp_shift) = quantize_lpc_coefficients(&coeffs, 12).unwrap();
let residuals = lpc_predictor_residual(&samples, &qlp_coeffs, qlp_shift).unwrap();
let restored =
lpc_predictor_restore(&samples[..order], &residuals, &qlp_coeffs, qlp_shift).unwrap();
assert_eq!(restored, samples);
}
#[test]
fn test_find_best_fixed_order() {
let linear: Vec<i32> = (0..100).map(|i| i * 10).collect();
let order = find_best_fixed_order(&linear);
assert!(order >= 2, "Linear signal should use order >= 2");
let constant: Vec<i32> = vec![42; 100];
let order = find_best_fixed_order(&constant);
assert!(order >= 1, "Constant signal should use order >= 1");
}
}