use std::cmp::Ordering;
use ndarray::ArrayView2;
use rayon::prelude::*;
use statrs::distribution::{Beta, ContinuousCDF};
// Lower tail probability used by `compute_tqplus_calibration`, both for the
// reference Beta quantile and for the index of the empirical per-dimension quantile.
const TQPLUS_P_LO: f64 = 0.05;
// Upper tail probability, used the same way as `TQPLUS_P_LO`.
const TQPLUS_P_HI: f64 = 0.95;
// Minimum number of rows required before calibration is estimated from data;
// with fewer rows `compute_tqplus_calibration` returns shift = 0 and scale = 1.
const TQPLUS_MIN_SAMPLES: usize = 1000;
/// Encodes `n` row-major vectors of `dim` components into bit-plane packed codes.
///
/// Steps performed:
/// 1. Every input row is divided by its L2 norm (rows whose norm is not above
///    `1e-10` become all zeros); the norm itself is kept for the final scale.
/// 2. The normalized rows are multiplied by the transpose of `rotation`, which
///    is read as a row-major `dim x dim` matrix.
/// 3. A per-dimension affine calibration `(x + shift[d]) * scale_tq[d]` is
///    applied. `existing_calibration = Some((shift, scale_tq))` reuses a previous
///    calibration; `None` estimates one via `compute_tqplus_calibration`.
/// 4. Each calibrated coordinate is turned into a code (the number of
///    `boundaries` it exceeds), the codes are packed into `bit_width` bit planes
///    of `dim / 8` bytes each, and a per-row scale is computed.
///
/// `centroids` is indexed by code, so it needs at least `boundaries.len() + 1`
/// entries for every code to be addressable.
///
/// # Returns
/// `(packed, scales, shift, scale_tq)` where `packed` holds
/// `n * bit_width * dim / 8` bytes, `scales` holds one value per row, and
/// `shift` / `scale_tq` are the calibration vectors that were applied.
///
/// # Panics
/// Panics if `dim` is not a positive multiple of 8, if `bit_width` is outside
/// `1..=8`, if `vectors` holds fewer than `n * dim` values, if `rotation` cannot
/// be viewed as `dim x dim`, or if a supplied calibration slice does not have
/// length `dim`.
pub fn encode(
    vectors: &[f32],
    n: usize,
    dim: usize,
    rotation: &[f32],
    boundaries: &[f32],
    centroids: &[f32],
    bit_width: usize,
    existing_calibration: Option<(&[f32], &[f32])>,
) -> (Vec<u8>, Vec<f32>, Vec<f32>, Vec<f32>) {
    // Each bit plane stores exactly `dim / 8` bytes; any other `dim` would make
    // the packer write bits into a neighbouring plane or past the row.
    assert!(
        dim > 0 && dim % 8 == 0,
        "dim must be a positive multiple of 8, got {}",
        dim
    );
    // Codes are packed from a u8, so at most 8 planes are meaningful; zero
    // planes would produce zero-sized rows.
    assert!(
        (1..=8).contains(&bit_width),
        "bit_width must be in 1..=8, got {}",
        bit_width
    );
    let total = n.checked_mul(dim).expect("n * dim overflows usize");
    assert!(
        vectors.len() >= total,
        "vectors holds {} values but n * dim = {}",
        vectors.len(),
        total
    );

    // Step 1: per-row norm and unit-length copy.
    let mut norms = vec![0.0f32; n];
    let mut unit_flat = vec![0.0f32; total];
    norms
        .par_iter_mut()
        .zip(unit_flat.par_chunks_mut(dim))
        .enumerate()
        .for_each(|(i, (norm, unit_row))| {
            let row = &vectors[i * dim..(i + 1) * dim];
            let n_val = simd_norm(row);
            *norm = n_val;
            // Near-zero rows are mapped to the zero vector instead of dividing by ~0.
            let inv = if n_val > 1e-10 { 1.0 / n_val } else { 0.0 };
            simd_scale(row, inv, unit_row);
        });

    // Step 2: rotate all rows at once (unit * rotation^T).
    let unit_mat = ArrayView2::from_shape((n, dim), &unit_flat)
        .expect("unit_flat holds exactly n * dim values");
    let rot_mat = ArrayView2::from_shape((dim, dim), rotation)
        .expect("rotation must hold a row-major dim x dim matrix");
    let rotated_mat = unit_mat.dot(&rot_mat.t());
    let rotated = rotated_mat
        .as_slice()
        .expect("rotated matrix must be contiguous in row-major order");

    // Step 3: obtain and apply the per-dimension calibration.
    let (shift, scale_tq) = match existing_calibration {
        Some((s, sc)) => {
            assert_eq!(s.len(), dim, "existing shift length must equal dim");
            assert_eq!(sc.len(), dim, "existing scale_tq length must equal dim");
            (s.to_vec(), sc.to_vec())
        }
        None => compute_tqplus_calibration(rotated, n, dim),
    };
    let mut rotated_calib = vec![0.0f32; total];
    rotated_calib
        .par_chunks_mut(dim)
        .zip(rotated.par_chunks(dim))
        .for_each(|(calib_row, orig_row)| {
            for (((dst, &x), &sh), &sc) in calib_row
                .iter_mut()
                .zip(orig_row)
                .zip(&shift)
                .zip(&scale_tq)
            {
                *dst = (x + sh) * sc;
            }
        });
    // The packer maps centroids back to the uncalibrated space, which needs 1 / scale.
    let inv_scale_tq: Vec<f32> = scale_tq.iter().map(|s| 1.0 / s).collect();

    // Step 4: quantize, pack and compute the per-row scale.
    let bytes_per_plane = dim / 8;
    let bytes_per_row = bit_width * bytes_per_plane;
    let mut packed = vec![0u8; n * bytes_per_row];
    let mut scales = vec![0.0f32; n];
    packed
        .par_chunks_mut(bytes_per_row)
        .zip(scales.par_iter_mut())
        .enumerate()
        .for_each(|(i, (packed_row, scale))| {
            let rot_orig = &rotated[i * dim..(i + 1) * dim];
            let rot_calib = &rotated_calib[i * dim..(i + 1) * dim];
            *scale = fused_quantize_scale_pack(
                rot_orig,
                rot_calib,
                &shift,
                &inv_scale_tq,
                boundaries,
                centroids,
                norms[i],
                packed_row,
                dim,
                bit_width,
                bytes_per_plane,
            );
        });
    (packed, scales, shift, scale_tq)
}
/// Estimates a per-dimension affine calibration `(shift, scale)` from rotated rows.
///
/// `rotated` is read as `n` row-major rows of `dim` values. For every dimension
/// the empirical `TQPLUS_P_LO` / `TQPLUS_P_HI` quantiles are matched to the same
/// quantiles of a symmetric `Beta((dim - 1) / 2, (dim - 1) / 2)` distribution
/// rescaled from `[0, 1]` to `[-1, 1]`, so that `(x + shift[d]) * scale[d]` sends
/// the empirical quantiles onto the reference ones.
///
/// The identity calibration (`shift = 0`, `scale = 1`) is returned when there are
/// fewer than `TQPLUS_MIN_SAMPLES` rows or when `dim < 2` (the Beta parameters
/// would not be positive). A dimension whose empirical quantile span is not
/// above `1e-6` also keeps the identity calibration.
fn compute_tqplus_calibration(
    rotated: &[f32],
    n: usize,
    dim: usize,
) -> (Vec<f32>, Vec<f32>) {
    let mut shift = vec![0.0f32; dim];
    let mut scale = vec![1.0f32; dim];
    // `dim < 2` would give Beta parameters <= 0, which `Beta::new` rejects.
    if n < TQPLUS_MIN_SAMPLES || dim < 2 {
        return (shift, scale);
    }

    // Reference quantiles, mapped from the Beta support [0, 1] to [-1, 1].
    let a = (dim as f64 - 1.0) / 2.0;
    let beta = Beta::new(a, a).expect("Beta(a, a) is valid for a > 0");
    let qc_lo = (2.0 * beta.inverse_cdf(TQPLUS_P_LO) - 1.0) as f32;
    let qc_hi = (2.0 * beta.inverse_cdf(TQPLUS_P_HI) - 1.0) as f32;
    let qc_span = qc_hi - qc_lo;

    // Positions of the empirical quantiles in a sorted column.
    let lo_idx = ((n as f64) * TQPLUS_P_LO) as usize;
    let hi_idx = (((n as f64) * TQPLUS_P_HI) as usize).min(n - 1);

    shift
        .par_iter_mut()
        .zip(scale.par_iter_mut())
        .enumerate()
        .for_each(|(d, (sh, sc))| {
            // Gather column `d` so it can be sorted independently of the others.
            let mut coord: Vec<f32> = (0..n).map(|i| rotated[i * dim + d]).collect();
            // `total_cmp` is a genuine total order even when NaN is present, which
            // the sort requires from its comparator.
            coord.sort_unstable_by(|a, b| -> Ordering { a.total_cmp(b) });
            let qe_lo = coord[lo_idx];
            let qe_hi = coord[hi_idx];
            let qe_span = qe_hi - qe_lo;
            // A (near-)constant column keeps the identity calibration.
            if qe_span > 1e-6 {
                *sc = qc_span / qe_span;
                // Chosen so that (qe_lo + shift) * scale == qc_lo.
                *sh = qc_lo / *sc - qe_lo;
            }
        });
    (shift, scale)
}
/// Returns the Euclidean (L2) norm of `row`, using NEON for groups of four values.
///
/// Squares are accumulated in four independent lanes, the lanes are added
/// together, and the up-to-three leftover values are added afterwards.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn simd_norm(row: &[f32]) -> f32 {
    use std::arch::aarch64::*;
    let mut quads = row.chunks_exact(4);
    // SAFETY: broadcasting a constant touches no memory.
    let mut lanes = unsafe { vdupq_n_f32(0.0) };
    for quad in quads.by_ref() {
        // SAFETY: `chunks_exact(4)` only yields slices of exactly four f32
        // values, so the 128-bit load stays inside `quad`.
        lanes = unsafe {
            let v = vld1q_f32(quad.as_ptr());
            vfmaq_f32(lanes, v, v)
        };
    }
    // SAFETY: horizontal add of a register value; no memory access.
    let mut sum_sq = unsafe { vaddvq_f32(lanes) };
    for &x in quads.remainder() {
        sum_sq += x * x;
    }
    sum_sq.sqrt()
}
/// Writes `row[j] * scale` into `out[j]` for every element of `row`, using NEON
/// for groups of four values.
///
/// Elements of `out` beyond `row.len()` are left untouched.
///
/// # Panics
/// Panics if `out` is shorter than `row`.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn simd_scale(row: &[f32], scale: f32, out: &mut [f32]) {
    use std::arch::aarch64::*;
    let dim = row.len();
    // The vector stores below go through raw pointers, so the length must be
    // verified up front to keep this safe function sound.
    assert!(
        out.len() >= dim,
        "simd_scale: output has {} values but input has {}",
        out.len(),
        dim
    );
    let vector_len = dim - dim % 4;
    // SAFETY: broadcasting a constant touches no memory.
    let factor = unsafe { vdupq_n_f32(scale) };
    for (src, dst) in row[..vector_len]
        .chunks_exact(4)
        .zip(out[..vector_len].chunks_exact_mut(4))
    {
        // SAFETY: both `src` and `dst` are exactly four f32 values long, so the
        // 128-bit load and store stay inside their slices.
        unsafe {
            vst1q_f32(dst.as_mut_ptr(), vmulq_f32(vld1q_f32(src.as_ptr()), factor));
        }
    }
    // Scalar tail for the last `dim % 4` values.
    for j in vector_len..dim {
        out[j] = row[j] * scale;
    }
}
/// Returns the Euclidean (L2) norm of `row`; portable fallback used when the
/// target is not aarch64.
#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
fn simd_norm(row: &[f32]) -> f32 {
    let sum_of_squares: f32 = row.iter().map(|&value| value * value).sum();
    sum_of_squares.sqrt()
}
/// Writes `row[j] * scale` into `out[j]` for every element of `row`; portable
/// fallback used when the target is not aarch64.
///
/// Indexing `out` panics if it is shorter than `row`.
#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
fn simd_scale(row: &[f32], scale: f32, out: &mut [f32]) {
    for (idx, &value) in row.iter().enumerate() {
        out[idx] = value * scale;
    }
}
/// Quantizes one calibrated row, packs the codes into bit planes and returns
/// the row's scale (aarch64 NEON implementation).
///
/// For every dimension `d` the code is the number of `boundaries` that
/// `rot_calib[d]` strictly exceeds. The matching centroid is mapped back to the
/// uncalibrated space as `centroids[code] * inv_scale_tq[d] - shift[d]` and its
/// product with `rot_orig[d]` is accumulated in f64.
///
/// Bit `p` of each code is stored in plane `p`, which starts at
/// `packed_row[p * bytes_per_plane]`; inside a byte the first of eight
/// consecutive dimensions occupies the most significant bit. Full groups of
/// eight dimensions overwrite their byte, while the scalar tail ORs bits in, so
/// `packed_row` is expected to be zero-initialized by the caller.
///
/// Returns `norm / max(inner, 1e-10)`, where `inner` is the accumulated product.
///
/// # Panics
/// Panics if `rot_calib` has fewer than `dim` values, or if any other slice is
/// too short for the indices used.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn fused_quantize_scale_pack(
    rot_orig: &[f32],
    rot_calib: &[f32],
    shift: &[f32],
    inv_scale_tq: &[f32],
    boundaries: &[f32],
    centroids: &[f32],
    norm: f32,
    packed_row: &mut [u8],
    dim: usize,
    bits: usize,
    bytes_per_plane: usize,
) -> f32 {
    use std::arch::aarch64::*;
    // `rot_calib` is read through raw pointers below, so its length has to be
    // checked explicitly; every other slice is accessed with checked indexing.
    assert!(
        rot_calib.len() >= dim,
        "fused_quantize_scale_pack: rot_calib has {} values but dim is {}",
        rot_calib.len(),
        dim
    );
    let mut inner = 0.0f64;
    let chunks = dim / 8;
    // SAFETY: every vector load from `rot_calib` covers indices
    // `offset..offset + 8` with `offset + 8 <= chunks * 8 <= dim <= rot_calib.len()`
    // (asserted above). The remaining loads read from local 8-byte arrays.
    unsafe {
        // Per-lane bit weights: the first dimension of a group maps to the MSB.
        let weights: [u8; 8] = [128, 64, 32, 16, 8, 4, 2, 1];
        let wv = vld1_u8(weights.as_ptr());
        for c in 0..chunks {
            let offset = c * 8;
            let vals_lo = vld1q_f32(rot_calib.as_ptr().add(offset));
            let vals_hi = vld1q_f32(rot_calib.as_ptr().add(offset + 4));
            // Count, per lane, how many boundaries the value exceeds: the compare
            // yields an all-ones mask, and shifting it right by 31 leaves 0 or 1.
            let mut acc_lo = vdupq_n_u32(0);
            let mut acc_hi = vdupq_n_u32(0);
            for &b in boundaries {
                let bv = vdupq_n_f32(b);
                acc_lo = vaddq_u32(acc_lo, vshrq_n_u32::<31>(vcgtq_f32(vals_lo, bv)));
                acc_hi = vaddq_u32(acc_hi, vshrq_n_u32::<31>(vcgtq_f32(vals_hi, bv)));
            }
            let counts: [u8; 8] = [
                vgetq_lane_u32::<0>(acc_lo) as u8,
                vgetq_lane_u32::<1>(acc_lo) as u8,
                vgetq_lane_u32::<2>(acc_lo) as u8,
                vgetq_lane_u32::<3>(acc_lo) as u8,
                vgetq_lane_u32::<0>(acc_hi) as u8,
                vgetq_lane_u32::<1>(acc_hi) as u8,
                vgetq_lane_u32::<2>(acc_hi) as u8,
                vgetq_lane_u32::<3>(acc_hi) as u8,
            ];
            // Accumulate <rot_orig, centroid mapped back to the uncalibrated space>.
            for k in 0..8 {
                let d = offset + k;
                let centroid_in_orig =
                    (centroids[counts[k] as usize] as f64) * (inv_scale_tq[d] as f64)
                        - (shift[d] as f64);
                inner += (rot_orig[d] as f64) * centroid_in_orig;
            }
            // Pack bit `p` of the eight codes into one byte of plane `p`.
            let codes_vec = vld1_u8(counts.as_ptr());
            for p in 0..bits {
                let mask = vdup_n_u8(1u8 << p);
                let hit = vcgt_u8(vand_u8(codes_vec, mask), vdup_n_u8(0));
                packed_row[p * bytes_per_plane + offset / 8] = vaddv_u8(vand_u8(hit, wv));
            }
        }
    }
    // Scalar tail for dimensions that do not fill a whole group of eight.
    for j in (chunks * 8)..dim {
        let mut code = 0u8;
        for &b in boundaries {
            if rot_calib[j] > b {
                code += 1;
            }
        }
        let centroid_in_orig =
            (centroids[code as usize] as f64) * (inv_scale_tq[j] as f64) - (shift[j] as f64);
        inner += (rot_orig[j] as f64) * centroid_in_orig;
        let byte_pos = j / 8;
        let bit_pos = 7 - (j % 8);
        for p in 0..bits {
            if code & (1 << p) != 0 {
                packed_row[p * bytes_per_plane + byte_pos] |= 1 << bit_pos;
            }
        }
    }
    // Clamp so a non-positive inner product cannot yield a division by zero
    // or a negative scale.
    let inner = inner.max(1e-10) as f32;
    norm / inner
}
/// Quantizes one calibrated row, packs the codes into bit planes and returns
/// the row's scale (portable fallback for non-aarch64 targets).
///
/// For every dimension `j` the code is the number of `boundaries` that
/// `rot_calib[j]` strictly exceeds. The matching centroid is mapped back to the
/// uncalibrated space as `centroids[code] * inv_scale_tq[j] - shift[j]` and its
/// product with `rot_orig[j]` is accumulated in f64.
///
/// Bit `p` of each code is ORed into plane `p`, which starts at
/// `packed_row[p * bytes_per_plane]`; inside a byte the first of eight
/// consecutive dimensions occupies the most significant bit. Because bits are
/// only ever set, `packed_row` is expected to be zero-initialized by the caller.
///
/// Returns `norm / max(inner, 1e-10)`, where `inner` is the accumulated product.
#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
fn fused_quantize_scale_pack(
    rot_orig: &[f32],
    rot_calib: &[f32],
    shift: &[f32],
    inv_scale_tq: &[f32],
    boundaries: &[f32],
    centroids: &[f32],
    norm: f32,
    packed_row: &mut [u8],
    dim: usize,
    bits: usize,
    bytes_per_plane: usize,
) -> f32 {
    let mut dot = 0.0f64;
    for j in 0..dim {
        // Code = number of boundaries strictly below the calibrated value.
        let calibrated = rot_calib[j];
        let mut level = 0u8;
        for &edge in boundaries {
            level += u8::from(calibrated > edge);
        }

        // Centroid expressed in the uncalibrated (rotated) space.
        let reconstructed = f64::from(centroids[usize::from(level)])
            * f64::from(inv_scale_tq[j])
            - f64::from(shift[j]);
        dot += f64::from(rot_orig[j]) * reconstructed;

        // Scatter the bits of the code across the planes, MSB-first within a byte.
        let column = j / 8;
        let bit_in_byte = 7 - (j % 8);
        for plane in 0..bits {
            if level & (1 << plane) != 0 {
                packed_row[plane * bytes_per_plane + column] |= 1 << bit_in_byte;
            }
        }
    }
    // Clamp so a non-positive inner product cannot yield a division by zero
    // or a negative scale.
    let clamped = dot.max(1e-10) as f32;
    norm / clamped
}