use core::arch::aarch64::*;
#[cfg(not(feature = "std"))]
use alloc::vec::Vec;
#[cfg(feature = "std")]
use std::vec::Vec;
use super::shuffle::{
DATA_LEN_1234, DATA_LEN_1248_PAIR, ENCODE_TABLE_1234, ENCODE_TABLE_1248_PAIR, TABLE_1234,
TABLE_1248_PAIR,
};
use crate::error::DecodeError;
/// Appends the 1/2/3/4-byte group encoding of `values` to `out`.
///
/// The appended bytes are a control stream of `values.len().div_ceil(4)` bytes followed by
/// the packed payload. Every control byte carries four 2-bit tags, with the tag of the
/// `k`-th value of a group stored in bits `2k..2k + 2`. In the scalar tail a tag `t` means
/// the value occupies `t + 1` little-endian bytes; the vector path takes its byte layout
/// and lengths from `ENCODE_TABLE_1234` / `DATA_LEN_1234`.
///
/// Only the low 32 bits of each value are encoded; higher bits are discarded.
///
/// # Safety
///
/// The executing CPU must support the `neon` target feature.
#[allow(dead_code)]
#[target_feature(enable = "neon")]
pub(super) unsafe fn encode_into_1234(values: &[u64], out: &mut Vec<u8>) {
    if values.is_empty() {
        return;
    }
    let total = values.len();
    let ctrl_bytes = total.div_ceil(4);
    let ctrl_off = out.len();
    let payload_off = ctrl_off + ctrl_bytes;

    // Worst case is 4 payload bytes per value. The 16 spare bytes keep every full-width
    // vector store below inside the allocation even when only part of it is retained.
    out.reserve(ctrl_bytes + 4 * total + 16);
    out.resize(payload_off, 0u8);

    let full_blocks = total / 4;
    let src = values.as_ptr();
    let dst = out.as_mut_ptr();
    let mut written = 0usize;

    // Multiplying the four tags by 1, 4, 16, 64 and summing packs them into one byte.
    let lane_weights = unsafe { vld1_u8([1u8, 4, 16, 64, 0, 0, 0, 0].as_ptr()) };
    let one_byte_max = vdupq_n_u32(0xFF);
    let two_byte_max = vdupq_n_u32(0xFFFF);
    let three_byte_max = vdupq_n_u32(0xFF_FFFF);

    for block in 0..full_blocks {
        let first = block * 4;
        // Load four u64 values and narrow them to their low 32 bits.
        let quad = unsafe {
            let front = vld1q_u64(src.add(first));
            let back = vld1q_u64(src.add(first + 2));
            vcombine_u32(vmovn_u64(front), vmovn_u64(back))
        };

        // Each comparison yields an all-ones lane; shifting by 31 turns it into 0 or 1,
        // so the sum of the three is the tag (0..=3) of the lane.
        let over_one = vshrq_n_u32::<31>(vcgtq_u32(quad, one_byte_max));
        let over_two = vshrq_n_u32::<31>(vcgtq_u32(quad, two_byte_max));
        let over_three = vshrq_n_u32::<31>(vcgtq_u32(quad, three_byte_max));
        let tags32 = vaddq_u32(over_one, vaddq_u32(over_two, over_three));
        let tags8 = vmovn_u16(vcombine_u16(vmovn_u32(tags32), vdup_n_u16(0)));
        let ctrl = vaddv_u8(vmul_u8(tags8, lane_weights));
        let key = usize::from(ctrl);

        unsafe {
            *dst.add(ctrl_off + block) = ctrl;
            let shuffle = vld1q_u8(ENCODE_TABLE_1234[key].as_ptr());
            let packed = vqtbl1q_u8(vreinterpretq_u8_u32(quad), shuffle);
            // Always stores 16 bytes; only the first `DATA_LEN_1234[key]` are kept.
            vst1q_u8(dst.add(payload_off + written), packed);
        }
        written += DATA_LEN_1234[key] as usize;
    }

    unsafe {
        out.set_len(payload_off + written);
    }

    // Scalar tail: up to three leftover values share the final control byte.
    let tail_ctrl = ctrl_off + full_blocks;
    for (lane, &v) in values[full_blocks * 4..].iter().enumerate() {
        let (tag, width): (u8, usize) = match v {
            0..=0xFF => (0, 1),
            0x100..=0xFFFF => (1, 2),
            0x1_0000..=0xFF_FFFF => (2, 3),
            _ => (3, 4),
        };
        out[tail_ctrl] |= tag << (lane * 2);
        out.extend_from_slice(&(v as u32).to_le_bytes()[..width]);
    }
}
/// Appends the 1/2/4/8-byte group encoding of `values` to `out`.
///
/// The appended bytes are a control stream of `values.len().div_ceil(4)` bytes followed by
/// the packed payload. Every control byte carries four 2-bit tags, with the tag of the
/// `k`-th value of a group stored in bits `2k..2k + 2`. In the scalar tail a tag `t` means
/// the value occupies `1 << t` little-endian bytes. Full groups are packed two values at a
/// time: the low and high nibble of the control byte each index
/// `ENCODE_TABLE_1248_PAIR` / `DATA_LEN_1248_PAIR`.
///
/// # Safety
///
/// The executing CPU must support the `neon` target feature.
#[allow(dead_code)]
#[target_feature(enable = "neon")]
pub(super) unsafe fn encode_into_1248(values: &[u64], out: &mut Vec<u8>) {
    /// Maps a value to its 2-bit width tag (0, 1, 2, 3 for 1, 2, 4, 8 bytes).
    fn width_tag(v: u64) -> u8 {
        match v {
            0..=0xFF => 0,
            0x100..=0xFFFF => 1,
            0x1_0000..=0xFFFF_FFFF => 2,
            _ => 3,
        }
    }

    if values.is_empty() {
        return;
    }
    let total = values.len();
    let ctrl_bytes = total.div_ceil(4);
    let ctrl_off = out.len();
    let payload_off = ctrl_off + ctrl_bytes;

    // Worst case is 8 payload bytes per value. The 16 spare bytes keep every full-width
    // vector store below inside the allocation even when only part of it is retained.
    out.reserve(ctrl_bytes + 8 * total + 16);
    out.resize(payload_off, 0u8);

    let full_blocks = total / 4;
    let dst = out.as_mut_ptr();
    let mut written = 0usize;

    for (block, quad) in values.chunks_exact(4).enumerate() {
        // Pack the four tags, value `k` in bits `2k..2k + 2`.
        let ctrl = quad
            .iter()
            .enumerate()
            .fold(0u8, |acc, (lane, &v)| acc | (width_tag(v) << (2 * lane)));
        let pair_keys = [usize::from(ctrl & 0x0F), usize::from(ctrl >> 4)];

        unsafe {
            *dst.add(ctrl_off + block) = ctrl;
        }

        // Low nibble describes values 0 and 1 of the group, high nibble values 2 and 3.
        for (half, &key) in pair_keys.iter().enumerate() {
            unsafe {
                let pair = vld1q_u8(quad.as_ptr().add(half * 2) as *const u8);
                let shuffle = vld1q_u8(ENCODE_TABLE_1248_PAIR[key].as_ptr());
                // Always stores 16 bytes; only the first `DATA_LEN_1248_PAIR[key]` are kept.
                vst1q_u8(dst.add(payload_off + written), vqtbl1q_u8(pair, shuffle));
            }
            written += DATA_LEN_1248_PAIR[key] as usize;
        }
    }

    unsafe {
        out.set_len(payload_off + written);
    }

    // Scalar tail: up to three leftover values share the final control byte.
    let tail_ctrl = ctrl_off + full_blocks;
    for (lane, &v) in values[full_blocks * 4..].iter().enumerate() {
        let tag = width_tag(v);
        let width = 1usize << tag;
        out[tail_ctrl] |= tag << (lane * 2);
        out.extend_from_slice(&v.to_le_bytes()[..width]);
    }
}
/// Decodes `n` values in the 1/2/3/4-byte group encoding from `data` and appends them to
/// `out`.
///
/// `data` must start with `n.div_ceil(4)` control bytes, followed by the payload. Full
/// groups of four are decoded with NEON table lookups; any remaining values (and any group
/// whose payload turns out to be truncated) are handed to the scalar decoder.
///
/// # Errors
///
/// Returns [`DecodeError::ControlStreamTooShort`] when `data` is shorter than the control
/// stream. Errors from `super::scalar::decode_1234_from_raw` are propagated unchanged.
/// Values decoded before an error was detected may already have been appended to `out`.
///
/// # Safety
///
/// The executing CPU must support the `neon` target feature.
#[allow(dead_code)]
#[target_feature(enable = "neon")]
pub(super) unsafe fn decode_into_1234(
    data: &[u8],
    n: usize,
    out: &mut Vec<u64>,
) -> Result<(), DecodeError> {
    if n == 0 {
        return Ok(());
    }
    let ctrl_len = n.div_ceil(4);
    if data.len() < ctrl_len {
        return Err(DecodeError::ControlStreamTooShort {
            need: ctrl_len,
            have: data.len(),
        });
    }
    let (ctrl, data_bytes) = data.split_at(ctrl_len);
    out.reserve(n);
    let base = out.len();
    let mut ctrl_pos = 0usize;
    let mut data_pos = 0usize;
    let mut decoded = 0usize;

    // Fast path: runs while a full 16-byte load from the payload stays in bounds.
    while decoded + 4 <= n {
        let cb = ctrl[ctrl_pos];
        if data_pos + 16 > data_bytes.len() {
            break;
        }
        let u32s = unsafe {
            let mask = vld1q_u8(TABLE_1234[cb as usize].as_ptr());
            let chunk = vld1q_u8(data_bytes.as_ptr().add(data_pos));
            vreinterpretq_u32_u8(vqtbl1q_u8(chunk, mask))
        };
        // Widen the four u32 lanes to u64.
        let lo = vmovl_u32(vget_low_u32(u32s));
        let hi = vmovl_high_u32(u32s);
        unsafe {
            // In bounds of the spare capacity: `decoded + 4 <= n` and `reserve(n)` above.
            let out_ptr = out.as_mut_ptr().add(base + decoded);
            vst1q_u64(out_ptr, lo);
            vst1q_u64(out_ptr.add(2), hi);
        }
        data_pos += DATA_LEN_1234[cb as usize] as usize;
        ctrl_pos += 1;
        decoded += 4;
    }
    unsafe {
        out.set_len(base + decoded);
    }

    // Slow path: fewer than 16 payload bytes remain, so copy them into a zero-padded
    // scratch buffer that is safe to load 16 bytes from.
    if decoded + 4 <= n {
        let mut padded = [0u8; 32];
        let rem = data_bytes.len() - data_pos;
        padded[..rem].copy_from_slice(&data_bytes[data_pos..]);
        let mut padded_pos = 0usize;
        while decoded + 4 <= n {
            let cb = ctrl[ctrl_pos];
            let consumed = DATA_LEN_1234[cb as usize] as usize;
            // The control byte asks for more payload than is actually left: the input is
            // truncated. Stop here so we neither read past `padded` nor decode padding
            // zeros as data; the scalar decoder below is expected to report the error.
            if padded_pos + consumed > rem {
                break;
            }
            let u32s = unsafe {
                let mask = vld1q_u8(TABLE_1234[cb as usize].as_ptr());
                // `padded_pos + consumed <= rem < 16`, so this 16-byte load ends before
                // byte 32 of `padded`.
                let chunk = vld1q_u8(padded.as_ptr().add(padded_pos));
                vreinterpretq_u32_u8(vqtbl1q_u8(chunk, mask))
            };
            let lo = vmovl_u32(vget_low_u32(u32s));
            let hi = vmovl_high_u32(u32s);
            unsafe {
                let out_ptr = out.as_mut_ptr().add(base + decoded);
                vst1q_u64(out_ptr, lo);
                vst1q_u64(out_ptr.add(2), hi);
            }
            padded_pos += consumed;
            data_pos += consumed;
            ctrl_pos += 1;
            decoded += 4;
        }
        unsafe {
            out.set_len(base + decoded);
        }
    }

    // Trailing values (and anything left after a truncation was detected) go through the
    // scalar decoder. `data_pos` never exceeds `data_bytes.len()` here.
    if decoded < n {
        super::scalar::decode_1234_from_raw(
            &ctrl[ctrl_pos..],
            &data_bytes[data_pos..],
            n - decoded,
            out,
        )?;
    }
    Ok(())
}
/// Decodes `n` values in the 1/2/4/8-byte group encoding from `data` and appends them to
/// `out`.
///
/// `data` must start with `n.div_ceil(4)` control bytes, followed by the payload. Full
/// groups of four are decoded as two pairs with NEON table lookups (low nibble of the
/// control byte for the first pair, high nibble for the second); any remaining values (and
/// any group whose payload turns out to be truncated) are handed to the scalar decoder.
///
/// # Errors
///
/// Returns [`DecodeError::ControlStreamTooShort`] when `data` is shorter than the control
/// stream. Errors from `super::scalar::decode_1248_from_raw` are propagated unchanged.
/// Values decoded before an error was detected may already have been appended to `out`.
///
/// # Safety
///
/// The executing CPU must support the `neon` target feature.
#[allow(dead_code)]
#[target_feature(enable = "neon")]
pub(super) unsafe fn decode_into_1248(
    data: &[u8],
    n: usize,
    out: &mut Vec<u64>,
) -> Result<(), DecodeError> {
    if n == 0 {
        return Ok(());
    }
    let ctrl_len = n.div_ceil(4);
    if data.len() < ctrl_len {
        return Err(DecodeError::ControlStreamTooShort {
            need: ctrl_len,
            have: data.len(),
        });
    }
    let (ctrl, data_bytes) = data.split_at(ctrl_len);
    out.reserve(n);
    let base = out.len();
    let mut ctrl_pos = 0usize;
    let mut data_pos = 0usize;
    let mut decoded = 0usize;

    // Fast path: runs while both 16-byte pair loads stay inside the payload.
    while decoded + 4 <= n {
        let cb = ctrl[ctrl_pos];
        let lo_key = (cb & 0x0F) as usize;
        let hi_key = (cb >> 4) as usize;
        let lo_bytes = DATA_LEN_1248_PAIR[lo_key] as usize;
        if data_pos + lo_bytes + 16 > data_bytes.len() {
            break;
        }
        let (lo_pair, hi_pair) = unsafe {
            let mask_lo = vld1q_u8(TABLE_1248_PAIR[lo_key].as_ptr());
            let chunk_lo = vld1q_u8(data_bytes.as_ptr().add(data_pos));
            let lo = vqtbl1q_u8(chunk_lo, mask_lo);
            let mask_hi = vld1q_u8(TABLE_1248_PAIR[hi_key].as_ptr());
            let chunk_hi = vld1q_u8(data_bytes.as_ptr().add(data_pos + lo_bytes));
            let hi = vqtbl1q_u8(chunk_hi, mask_hi);
            (lo, hi)
        };
        unsafe {
            // Two 16-byte stores cover four u64 slots; in bounds of the spare capacity
            // because `decoded + 4 <= n` and `reserve(n)` above.
            let out_ptr = out.as_mut_ptr().add(base + decoded) as *mut u8;
            vst1q_u8(out_ptr, lo_pair);
            vst1q_u8(out_ptr.add(16), hi_pair);
        }
        data_pos += lo_bytes + DATA_LEN_1248_PAIR[hi_key] as usize;
        ctrl_pos += 1;
        decoded += 4;
    }
    unsafe {
        out.set_len(base + decoded);
    }

    // Slow path: too few payload bytes remain for direct 16-byte loads, so copy them into
    // a zero-padded scratch buffer that is safe to over-read.
    if decoded + 4 <= n {
        let mut padded = [0u8; 64];
        let rem = data_bytes.len() - data_pos;
        padded[..rem].copy_from_slice(&data_bytes[data_pos..]);
        let mut padded_pos = 0usize;
        while decoded + 4 <= n {
            let cb = ctrl[ctrl_pos];
            let lo_key = (cb & 0x0F) as usize;
            let hi_key = (cb >> 4) as usize;
            let lo_bytes = DATA_LEN_1248_PAIR[lo_key] as usize;
            let consumed = lo_bytes + DATA_LEN_1248_PAIR[hi_key] as usize;
            // The control byte asks for more payload than is actually left: the input is
            // truncated. Stop here so we neither read past `padded` nor decode padding
            // zeros as data; the scalar decoder below is expected to report the error.
            if padded_pos + consumed > rem {
                break;
            }
            let (lo_pair, hi_pair) = unsafe {
                let mask_lo = vld1q_u8(TABLE_1248_PAIR[lo_key].as_ptr());
                // `padded_pos + consumed <= rem`, and `rem` is below 32 on entry to this
                // branch (assuming pair lengths are at most 16), so both 16-byte loads end
                // well before byte 64 of `padded`.
                let chunk_lo = vld1q_u8(padded.as_ptr().add(padded_pos));
                let lo = vqtbl1q_u8(chunk_lo, mask_lo);
                let mask_hi = vld1q_u8(TABLE_1248_PAIR[hi_key].as_ptr());
                let chunk_hi = vld1q_u8(padded.as_ptr().add(padded_pos + lo_bytes));
                let hi = vqtbl1q_u8(chunk_hi, mask_hi);
                (lo, hi)
            };
            unsafe {
                let out_ptr = out.as_mut_ptr().add(base + decoded) as *mut u8;
                vst1q_u8(out_ptr, lo_pair);
                vst1q_u8(out_ptr.add(16), hi_pair);
            }
            padded_pos += consumed;
            data_pos += consumed;
            ctrl_pos += 1;
            decoded += 4;
        }
        unsafe {
            out.set_len(base + decoded);
        }
    }

    // Trailing values (and anything left after a truncation was detected) go through the
    // scalar decoder. `data_pos` never exceeds `data_bytes.len()` here.
    if decoded < n {
        super::scalar::decode_1248_from_raw(
            &ctrl[ctrl_pos..],
            &data_bytes[data_pos..],
            n - decoded,
            out,
        )?;
    }
    Ok(())
}