#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;
#[cfg(target_arch = "aarch64")]
pub fn unpack_neon(data: &[u8], bit_width: u8, num_values: usize, out: &mut Vec<u32>) {
let mask_val = if bit_width >= 32 {
u32::MAX
} else {
(1u32 << bit_width) - 1
};
unsafe {
let mask = vdupq_n_u32(mask_val);
let bw = bit_width as u64;
let mut bit_pos = 0u64;
let chunks = num_values / 4;
let remainder = num_values % 4;
for _ in 0..chunks {
let mut vals = [0u32; 4];
for (j, val) in vals.iter_mut().enumerate() {
let bp = bit_pos + j as u64 * bw;
let byte_idx = (bp / 8) as usize;
let bit_offset = (bp % 8) as u32;
let mut wide_bytes = [0u8; 8];
let avail = data.len().saturating_sub(byte_idx).min(8);
wide_bytes[..avail].copy_from_slice(&data[byte_idx..byte_idx + avail]);
let wide = u64::from_le_bytes(wide_bytes);
*val = ((wide >> bit_offset) as u32) & mask_val;
}
let v = vld1q_u32(vals.as_ptr());
let masked = vandq_u32(v, mask);
vst1q_u32(out.as_mut_ptr().add(out.len()), masked);
out.set_len(out.len() + 4);
bit_pos += 4 * bw;
}
for _ in 0..remainder {
let byte_idx = (bit_pos / 8) as usize;
let bit_offset = (bit_pos % 8) as u32;
let mut wide_bytes = [0u8; 8];
let avail = data.len().saturating_sub(byte_idx).min(8);
wide_bytes[..avail].copy_from_slice(&data[byte_idx..byte_idx + avail]);
let wide = u64::from_le_bytes(wide_bytes);
let val = ((wide >> bit_offset) as u32) & mask_val;
out.push(val);
bit_pos += bw;
}
}
}
#[cfg(not(target_arch = "aarch64"))]
pub fn unpack_neon(_data: &[u8], _bit_width: u8, _num_values: usize, _out: &mut Vec<u32>) {
unreachable!("NEON unpack called on non-AArch64 platform");
}