use core::arch::x86_64::*;
#[cfg(not(feature = "std"))]
use alloc::vec::Vec;
#[cfg(feature = "std")]
use std::vec::Vec;
use super::shuffle::{
DATA_LEN, DATA_LEN_0124, ENCODE_TABLE_0124, ENCODE_TABLE_CLASSIC, TABLE, TABLE_0124,
};
use crate::error::DecodeError;
/// Appends the "classic" variable-byte encoding of `values` to `out`.
///
/// Every value is stored in 1, 2, 3 or 4 little-endian bytes (tags 0..=3).
/// The appended region consists of `values.len().div_ceil(4)` control bytes
/// (one 2-bit tag per value, the first value of each group of four in the
/// lowest bits) followed by the packed payload bytes. An empty input appends
/// nothing.
///
/// # Safety
///
/// The caller must ensure that the executing CPU supports SSSE3.
#[allow(dead_code)]
#[target_feature(enable = "ssse3")]
pub(super) unsafe fn encode_into_classic(values: &[u32], out: &mut Vec<u8>) {
    if values.is_empty() {
        return;
    }
    let total = values.len();
    let ctrl_bytes = total.div_ceil(4);
    let ctrl_base = out.len();
    let payload_base = ctrl_base + ctrl_bytes;
    let full_blocks = total / 4;

    // Worst case is four payload bytes per value; the 16 spare bytes allow each
    // SIMD store to write a whole vector even when only part of it is kept.
    out.reserve(ctrl_bytes + 4 * total + 16);
    out.resize(payload_base, 0u8);
    let buf = out.as_mut_ptr();

    // SSE only offers signed compares, so flip the sign bit of both the values
    // and the thresholds to obtain an unsigned ordering.
    let sign_flip = _mm_set1_epi32(i32::MIN);
    let above_1 = _mm_set1_epi32(i32::MIN + 0xFF);
    let above_2 = _mm_set1_epi32(i32::MIN + 0xFFFF);
    let above_3 = _mm_set1_epi32(i32::MIN + 0xFF_FFFF);
    let zeros = _mm_setzero_si128();
    // Gathers the low byte of each 32-bit lane into the low four bytes.
    let low_bytes = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);

    let quads = values.chunks_exact(4);
    let tail = quads.remainder();
    let mut payload_len = 0usize;

    for (block, quad) in quads.enumerate() {
        // SAFETY: `quad` holds exactly four `u32`s (16 readable bytes) and the
        // load tolerates any alignment.
        let lanes = unsafe { _mm_loadu_si128(quad.as_ptr() as *const __m128i) };
        let biased = _mm_add_epi32(lanes, sign_flip);
        // Each compare yields -1 for a hit, so the negated sum is the number of
        // thresholds exceeded, i.e. the tag.
        let hits = _mm_add_epi32(
            _mm_add_epi32(
                _mm_cmpgt_epi32(biased, above_1),
                _mm_cmpgt_epi32(biased, above_2),
            ),
            _mm_cmpgt_epi32(biased, above_3),
        );
        let tag_lanes = _mm_sub_epi32(zeros, hits);
        let [t0, t1, t2, t3] =
            _mm_cvtsi128_si32(_mm_shuffle_epi8(tag_lanes, low_bytes)).to_le_bytes();
        let ctrl = t0 | (t1 << 2) | (t2 << 4) | (t3 << 6);

        // SAFETY: the control slot lies inside the resized region, and the
        // payload store ends at most 16 bytes past `4 * total` payload bytes,
        // which the `reserve` above covers. The table load assumes each
        // `ENCODE_TABLE_CLASSIC` entry provides 16 bytes.
        unsafe {
            buf.add(ctrl_base + block).write(ctrl);
            let shuffle =
                _mm_loadu_si128(ENCODE_TABLE_CLASSIC[ctrl as usize].as_ptr() as *const __m128i);
            _mm_storeu_si128(
                buf.add(payload_base + payload_len) as *mut __m128i,
                _mm_shuffle_epi8(lanes, shuffle),
            );
        }
        payload_len += DATA_LEN[ctrl as usize] as usize;
    }

    // SAFETY: the new length stays within the reserved capacity and covers
    // only bytes written by the loop above.
    unsafe {
        out.set_len(payload_base + payload_len);
    }

    // Up to three leftover values share the final control byte.
    for (slot, &value) in tail.iter().enumerate() {
        let width: usize = match value {
            0..=0xFF => 1,
            0x100..=0xFFFF => 2,
            0x1_0000..=0xFF_FFFF => 3,
            _ => 4,
        };
        let tag = (width - 1) as u8;
        out[ctrl_base + full_blocks] |= tag << (slot * 2);
        out.extend_from_slice(&value.to_le_bytes()[..width]);
    }
}
/// Appends the "0124" variable-byte encoding of `values` to `out`.
///
/// Every value is stored in 0, 1, 2 or 4 little-endian bytes (tags 0..=3;
/// zero takes no payload bytes). The appended region consists of
/// `values.len().div_ceil(4)` control bytes (one 2-bit tag per value, the
/// first value of each group of four in the lowest bits) followed by the
/// packed payload bytes. An empty input appends nothing.
///
/// # Safety
///
/// The caller must ensure that the executing CPU supports SSSE3.
#[allow(dead_code)]
#[target_feature(enable = "ssse3")]
pub(super) unsafe fn encode_into_0124(values: &[u32], out: &mut Vec<u8>) {
    if values.is_empty() {
        return;
    }
    let total = values.len();
    let ctrl_bytes = total.div_ceil(4);
    let ctrl_base = out.len();
    let payload_base = ctrl_base + ctrl_bytes;
    let full_blocks = total / 4;

    // Worst case is four payload bytes per value; the 16 spare bytes allow each
    // SIMD store to write a whole vector even when only part of it is kept.
    out.reserve(ctrl_bytes + 4 * total + 16);
    out.resize(payload_base, 0u8);
    let buf = out.as_mut_ptr();

    // SSE only offers signed compares, so flip the sign bit of both the values
    // and the thresholds to obtain an unsigned ordering.
    let sign_flip = _mm_set1_epi32(i32::MIN);
    let above_0 = _mm_set1_epi32(i32::MIN);
    let above_1 = _mm_set1_epi32(i32::MIN + 0xFF);
    let above_2 = _mm_set1_epi32(i32::MIN + 0xFFFF);
    let zeros = _mm_setzero_si128();
    // Gathers the low byte of each 32-bit lane into the low four bytes.
    let low_bytes = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);

    let quads = values.chunks_exact(4);
    let tail = quads.remainder();
    let mut payload_len = 0usize;

    for (block, quad) in quads.enumerate() {
        // SAFETY: `quad` holds exactly four `u32`s (16 readable bytes) and the
        // load tolerates any alignment.
        let lanes = unsafe { _mm_loadu_si128(quad.as_ptr() as *const __m128i) };
        let biased = _mm_add_epi32(lanes, sign_flip);
        // Each compare yields -1 for a hit, so the negated sum is the number of
        // thresholds exceeded, i.e. the tag.
        let hits = _mm_add_epi32(
            _mm_add_epi32(
                _mm_cmpgt_epi32(biased, above_0),
                _mm_cmpgt_epi32(biased, above_1),
            ),
            _mm_cmpgt_epi32(biased, above_2),
        );
        let tag_lanes = _mm_sub_epi32(zeros, hits);
        let [t0, t1, t2, t3] =
            _mm_cvtsi128_si32(_mm_shuffle_epi8(tag_lanes, low_bytes)).to_le_bytes();
        let ctrl = t0 | (t1 << 2) | (t2 << 4) | (t3 << 6);

        // SAFETY: the control slot lies inside the resized region, and the
        // payload store ends at most 16 bytes past `4 * total` payload bytes,
        // which the `reserve` above covers. The table load assumes each
        // `ENCODE_TABLE_0124` entry provides 16 bytes.
        unsafe {
            buf.add(ctrl_base + block).write(ctrl);
            let shuffle =
                _mm_loadu_si128(ENCODE_TABLE_0124[ctrl as usize].as_ptr() as *const __m128i);
            _mm_storeu_si128(
                buf.add(payload_base + payload_len) as *mut __m128i,
                _mm_shuffle_epi8(lanes, shuffle),
            );
        }
        payload_len += DATA_LEN_0124[ctrl as usize] as usize;
    }

    // SAFETY: the new length stays within the reserved capacity and covers
    // only bytes written by the loop above.
    unsafe {
        out.set_len(payload_base + payload_len);
    }

    // Up to three leftover values share the final control byte.
    for (slot, &value) in tail.iter().enumerate() {
        let (tag, width): (u8, usize) = match value {
            0 => (0, 0),
            0x1..=0xFF => (1, 1),
            0x100..=0xFFFF => (2, 2),
            _ => (3, 4),
        };
        out[ctrl_base + full_blocks] |= tag << (slot * 2);
        // A zero value has width 0, so this appends nothing for it.
        out.extend_from_slice(&value.to_le_bytes()[..width]);
    }
}
/// Decodes `n` classic-encoded values from `data` and appends them to `out`.
///
/// `data` is expected to start with `n.div_ceil(4)` control bytes, followed
/// by the packed payload bytes.
///
/// # Errors
///
/// Returns [`DecodeError::ControlStreamTooShort`] when `data` is shorter than
/// the control stream. Any error from the scalar decoder is propagated; a
/// truncated payload is handed to that decoder so it can report the problem.
/// Values decoded before a failure remain appended to `out`.
///
/// # Safety
///
/// The caller must ensure that the executing CPU supports SSSE3.
#[allow(dead_code)]
#[target_feature(enable = "ssse3")]
pub(super) unsafe fn decode_into_classic(
    data: &[u8],
    n: usize,
    out: &mut Vec<u32>,
) -> Result<(), DecodeError> {
    if n == 0 {
        return Ok(());
    }
    let ctrl_len = n.div_ceil(4);
    if data.len() < ctrl_len {
        return Err(DecodeError::ControlStreamTooShort {
            need: ctrl_len,
            have: data.len(),
        });
    }
    let ctrl = &data[..ctrl_len];
    let data_bytes = &data[ctrl_len..];
    out.reserve(n);
    let base = out.len();
    let mut ctrl_pos = 0usize;
    let mut data_pos = 0usize;
    let mut decoded = 0usize;

    // Fast path: runs while a full 16-byte load stays inside `data_bytes`.
    while decoded + 4 <= n {
        if data_pos + 16 > data_bytes.len() {
            break;
        }
        let cb = ctrl[ctrl_pos];
        // SAFETY: the check above keeps the 16-byte payload load in bounds,
        // and `reserve(n)` plus `decoded + 4 <= n` keeps the 16-byte store
        // inside `out`'s capacity. The table load assumes each `TABLE` entry
        // provides 16 bytes.
        unsafe {
            let mask = _mm_loadu_si128(TABLE[cb as usize].as_ptr() as *const __m128i);
            let chunk = _mm_loadu_si128(data_bytes.as_ptr().add(data_pos) as *const __m128i);
            let out_ptr = out.as_mut_ptr().add(base + decoded) as *mut __m128i;
            _mm_storeu_si128(out_ptr, _mm_shuffle_epi8(chunk, mask));
        }
        data_pos += DATA_LEN[cb as usize] as usize;
        ctrl_pos += 1;
        decoded += 4;
    }
    // SAFETY: exactly `decoded` values were written past `base`, within capacity.
    unsafe {
        out.set_len(base + decoded);
    }

    // Tail path: fewer than 16 payload bytes remain, so copy them into a
    // zero-padded scratch buffer that full-width loads can read safely.
    if decoded + 4 <= n {
        let mut padded = [0u8; 32];
        let rem = data_bytes.len() - data_pos;
        padded[..rem].copy_from_slice(&data_bytes[data_pos..]);
        let mut padded_pos = 0usize;
        while decoded + 4 <= n {
            let cb = ctrl[ctrl_pos];
            let consumed = DATA_LEN[cb as usize] as usize;
            // A block that needs more bytes than remain means the payload is
            // truncated. Stop here: decoding the zero padding would yield
            // bogus values, and advancing further would move the load past
            // the scratch buffer. The scalar decoder below is expected to
            // report the error (its behavior is not visible in this file).
            if padded_pos + consumed > rem {
                break;
            }
            // SAFETY: `padded_pos <= rem < 16`, so the 16-byte load stays
            // inside the 32-byte scratch buffer; the store is within
            // capacity as in the fast path.
            unsafe {
                let mask = _mm_loadu_si128(TABLE[cb as usize].as_ptr() as *const __m128i);
                let chunk = _mm_loadu_si128(padded.as_ptr().add(padded_pos) as *const __m128i);
                let out_ptr = out.as_mut_ptr().add(base + decoded) as *mut __m128i;
                _mm_storeu_si128(out_ptr, _mm_shuffle_epi8(chunk, mask));
            }
            padded_pos += consumed;
            data_pos += consumed;
            ctrl_pos += 1;
            decoded += 4;
        }
        // SAFETY: exactly `decoded` values were written past `base`, within capacity.
        unsafe {
            out.set_len(base + decoded);
        }
    }

    // Remaining values (a partial final group, or everything after a
    // truncated block) go through the scalar decoder. `data_pos` never exceeds
    // `data_bytes.len()`, so the slice below cannot panic.
    if decoded < n {
        super::scalar::decode_classic_from_raw(
            &ctrl[ctrl_pos..],
            &data_bytes[data_pos..],
            n - decoded,
            out,
        )?;
    }
    Ok(())
}
/// Decodes `n` 0124-encoded values from `data` and appends them to `out`.
///
/// `data` is expected to start with `n.div_ceil(4)` control bytes, followed
/// by the packed payload bytes.
///
/// # Errors
///
/// Returns [`DecodeError::ControlStreamTooShort`] when `data` is shorter than
/// the control stream. Any error from the scalar decoder is propagated; a
/// truncated payload is handed to that decoder so it can report the problem.
/// Values decoded before a failure remain appended to `out`.
///
/// # Safety
///
/// The caller must ensure that the executing CPU supports SSSE3.
#[allow(dead_code)]
#[target_feature(enable = "ssse3")]
pub(super) unsafe fn decode_into_0124(
    data: &[u8],
    n: usize,
    out: &mut Vec<u32>,
) -> Result<(), DecodeError> {
    if n == 0 {
        return Ok(());
    }
    let ctrl_len = n.div_ceil(4);
    if data.len() < ctrl_len {
        return Err(DecodeError::ControlStreamTooShort {
            need: ctrl_len,
            have: data.len(),
        });
    }
    let ctrl = &data[..ctrl_len];
    let data_bytes = &data[ctrl_len..];
    out.reserve(n);
    let base = out.len();
    let mut ctrl_pos = 0usize;
    let mut data_pos = 0usize;
    let mut decoded = 0usize;

    // Fast path: runs while a full 16-byte load stays inside `data_bytes`.
    while decoded + 4 <= n {
        if data_pos + 16 > data_bytes.len() {
            break;
        }
        let cb = ctrl[ctrl_pos];
        // SAFETY: the check above keeps the 16-byte payload load in bounds,
        // and `reserve(n)` plus `decoded + 4 <= n` keeps the 16-byte store
        // inside `out`'s capacity. The table load assumes each `TABLE_0124`
        // entry provides 16 bytes.
        unsafe {
            let mask = _mm_loadu_si128(TABLE_0124[cb as usize].as_ptr() as *const __m128i);
            let chunk = _mm_loadu_si128(data_bytes.as_ptr().add(data_pos) as *const __m128i);
            let out_ptr = out.as_mut_ptr().add(base + decoded) as *mut __m128i;
            _mm_storeu_si128(out_ptr, _mm_shuffle_epi8(chunk, mask));
        }
        data_pos += DATA_LEN_0124[cb as usize] as usize;
        ctrl_pos += 1;
        decoded += 4;
    }
    // SAFETY: exactly `decoded` values were written past `base`, within capacity.
    unsafe {
        out.set_len(base + decoded);
    }

    // Tail path: fewer than 16 payload bytes remain, so copy them into a
    // zero-padded scratch buffer that full-width loads can read safely.
    if decoded + 4 <= n {
        let mut padded = [0u8; 32];
        let rem = data_bytes.len() - data_pos;
        padded[..rem].copy_from_slice(&data_bytes[data_pos..]);
        let mut padded_pos = 0usize;
        while decoded + 4 <= n {
            let cb = ctrl[ctrl_pos];
            let consumed = DATA_LEN_0124[cb as usize] as usize;
            // A block that needs more bytes than remain means the payload is
            // truncated. Stop here: decoding the zero padding would yield
            // bogus values, and advancing further would move the load past
            // the scratch buffer. The scalar decoder below is expected to
            // report the error (its behavior is not visible in this file).
            if padded_pos + consumed > rem {
                break;
            }
            // SAFETY: `padded_pos <= rem < 16`, so the 16-byte load stays
            // inside the 32-byte scratch buffer; the store is within
            // capacity as in the fast path.
            unsafe {
                let mask = _mm_loadu_si128(TABLE_0124[cb as usize].as_ptr() as *const __m128i);
                let chunk = _mm_loadu_si128(padded.as_ptr().add(padded_pos) as *const __m128i);
                let out_ptr = out.as_mut_ptr().add(base + decoded) as *mut __m128i;
                _mm_storeu_si128(out_ptr, _mm_shuffle_epi8(chunk, mask));
            }
            padded_pos += consumed;
            data_pos += consumed;
            ctrl_pos += 1;
            decoded += 4;
        }
        // SAFETY: exactly `decoded` values were written past `base`, within capacity.
        unsafe {
            out.set_len(base + decoded);
        }
    }

    // Remaining values (a partial final group, or everything after a
    // truncated block) go through the scalar decoder. `data_pos` never exceeds
    // `data_bytes.len()`, so the slice below cannot panic.
    if decoded < n {
        super::scalar::decode_0124_from_raw(
            &ctrl[ctrl_pos..],
            &data_bytes[data_pos..],
            n - decoded,
            out,
        )?;
    }
    Ok(())
}