use wide::{u8x16, u32x8, u32x16};
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
#[cfg(target_arch = "x86_64")]
use std::arch::is_x86_feature_detected;
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;
use crate::wide_utils::{
SimdSplit, WideUtilsExt,
SHUFFLE_COMPRESS_IDX_U8_HI, SHUFFLE_COMPRESS_IDX_U8_LO,
};
#[cfg(not(target_arch = "aarch64"))]
use crate::wide_utils::get_compress_indices_u32x8;
/// Left-packs the lanes of `data` selected by `mask` into the front of `dest`.
///
/// Bit `i` of `mask` selects lane `i`; selected lanes keep their relative
/// order. Returns the number of selected lanes (the population count of
/// `mask`).
///
/// Only `dest[..count]` is meaningful afterwards. The NEON backend stores a
/// full 8-lane vector, so on aarch64 `dest[count..8]` is overwritten as well.
///
/// # Panics
///
/// Panics if `dest` holds fewer than 8 elements, whatever `mask` selects.
#[inline]
pub fn compress_store_u32x8(data: u32x8, mask: u8, dest: &mut [u32]) -> usize {
    let selected = mask.count_ones() as usize;
    assert!(dest.len() >= 8, "destination buffer must have room for 8 elements");

    #[cfg(target_arch = "x86_64")]
    {
        let has_avx512 =
            is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl");
        if has_avx512 {
            // SAFETY: both required CPU features were detected just above, and
            // `dest` has room for 8 elements, the most a u8 mask can select.
            unsafe { compress_store_u32x8_avx512(data, mask, dest) };
            return selected;
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: `dest` has room for the helper's unconditional 8-element
        // store (asserted above). NEON availability is assumed rather than
        // detected at runtime here.
        unsafe { compress_store_u32x8_neon(data, mask, selected, dest) };
    }

    // Scalar path: x86_64 without AVX-512, and every other non-aarch64 target.
    #[cfg(not(target_arch = "aarch64"))]
    {
        compress_store_u32x8_gather(data, mask, dest);
    }

    selected
}
/// Left-packs the lanes of `data` selected by `mask` and returns the packed
/// vector together with the number of selected lanes.
///
/// Bit `i` of `mask` selects lane `i`; selected lanes keep their relative
/// order and occupy lanes `0..count` of the result. The contents of the
/// remaining lanes depend on the backend, so callers should rely only on the
/// first `count` lanes.
#[inline]
pub fn compress_u32x8(data: u32x8, mask: u8) -> (u32x8, usize) {
    let selected = mask.count_ones() as usize;

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            // SAFETY: both required CPU features were detected just above.
            let packed = unsafe { compress_u32x8_avx512(data, mask) };
            return (packed, selected);
        }
    }

    // SAFETY: the helper only needs NEON, which is assumed (not detected at
    // runtime) on aarch64 here.
    #[cfg(target_arch = "aarch64")]
    let packed = unsafe { compress_u32x8_neon_vec(data, mask) };

    // Table-driven lane shuffle: x86_64 without AVX-512 and all other targets.
    #[cfg(not(target_arch = "aarch64"))]
    let packed = data.shuffle(get_compress_indices_u32x8(mask));

    (packed, selected)
}
/// Stores the `mask`-selected lanes of `data` contiguously at the start of
/// `dest` with the AVX-512 compress-store instruction.
///
/// # Safety
///
/// The CPU must support `avx512f` and `avx512vl`. `dest` must have room for
/// `mask.count_ones()` elements: the store goes through a raw pointer and is
/// not bounds-checked.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f", enable = "avx512vl")]
unsafe fn compress_store_u32x8_avx512(data: u32x8, mask: u8, dest: &mut [u32]) {
    let out = dest.as_mut_ptr().cast::<i32>();
    // SAFETY: `u32x8` and `__m256i` have the same size (checked by
    // `transmute` at compile time); the caller guarantees the CPU features
    // and that `dest` can hold every selected lane.
    unsafe {
        let lanes: __m256i = std::mem::transmute(data);
        _mm256_mask_compressstoreu_epi32(out, mask, lanes);
    }
}
/// Left-packs the `mask`-selected lanes of `data` with the AVX-512 compress
/// instruction; the zero-masking form clears the lanes after the packed ones.
///
/// # Safety
///
/// The CPU must support `avx512f` and `avx512vl`.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f", enable = "avx512vl")]
unsafe fn compress_u32x8_avx512(data: u32x8, mask: u8) -> u32x8 {
    // SAFETY: `u32x8` and `__m256i` have the same size (checked by
    // `transmute` at compile time); the caller guarantees the CPU features.
    unsafe {
        let lanes: __m256i = std::mem::transmute(data);
        let packed = _mm256_maskz_compress_epi32(mask, lanes);
        std::mem::transmute(packed)
    }
}
/// Per-mask byte-shuffle indices used to left-pack a `u32x8` with two
/// 32-byte table lookups (`vqtbl2q_u8`).
///
/// Entry `[mask]` is `(lo, hi)`: `lo` names the source byte for each of output
/// bytes `0..16`, `hi` for output bytes `16..32`. Source lane `s` occupies
/// source bytes `4*s .. 4*s + 4`.
///
/// Output bytes that belong to no selected lane hold `0xFF`. That index is out
/// of range for a 32-byte table, so the lookup yields zero there and the lanes
/// after the packed ones come out as zero (matching the zero-masking AVX-512
/// compress) instead of replicating byte 0 of the input.
#[cfg(target_arch = "aarch64")]
static COMPRESS_BYTE_IDX_U32X8: [(u8x16, u8x16); 256] = {
    const fn arr_to_u8x16(arr: [u8; 16]) -> u8x16 {
        // SAFETY: `transmute` verifies at compile time that both types are 16
        // bytes; the target is a vector of plain `u8` lanes.
        unsafe { std::mem::transmute(arr) }
    }

    /// Out-of-range table index: makes the lookup produce a zero byte.
    const ZERO_FILL: u8 = 0xFF;

    let mut table: [(u8x16, u8x16); 256] =
        [(arr_to_u8x16([ZERO_FILL; 16]), arr_to_u8x16([ZERO_FILL; 16])); 256];
    let mut mask = 0usize;
    while mask < 256 {
        // halves[0] drives output bytes 0..16, halves[1] output bytes 16..32.
        let mut halves = [[ZERO_FILL; 16]; 2];
        let mut dest_lane = 0usize;
        let mut src_lane = 0usize;
        while src_lane < 8 {
            if (mask >> src_lane) & 1 != 0 {
                let dest_byte = dest_lane * 4;
                let mut k = 0usize;
                while k < 4 {
                    // A 4-byte lane never straddles the 16-byte boundary, so
                    // the half is fixed by the lane's first byte.
                    halves[dest_byte / 16][dest_byte % 16 + k] = (src_lane * 4 + k) as u8;
                    k += 1;
                }
                dest_lane += 1;
            }
            src_lane += 1;
        }
        table[mask] = (arr_to_u8x16(halves[0]), arr_to_u8x16(halves[1]));
        mask += 1;
    }
    table
};
/// Left-packs the `mask`-selected lanes of `data` with NEON table lookups and
/// stores the full 8-lane result at the start of `dest`.
///
/// All 32 bytes are written regardless of how many lanes are selected, so
/// `dest[count..8]` is overwritten too. `_count` is accepted for signature
/// parity with the caller and is not used.
///
/// # Safety
///
/// NEON must be available and `dest` must hold at least 8 elements; the stores
/// go through a raw pointer and are not bounds-checked in release builds.
#[cfg(target_arch = "aarch64")]
#[inline]
#[target_feature(enable = "neon")]
unsafe fn compress_store_u32x8_neon(data: u32x8, mask: u8, _count: usize, dest: &mut [u32]) {
    debug_assert!(dest.len() >= 8);
    let (idx_lo, idx_hi) = COMPRESS_BYTE_IDX_U32X8[mask as usize];
    let out = dest.as_mut_ptr().cast::<u8>();
    // SAFETY: every transmute is between equally sized plain-integer vector
    // types (size checked at compile time). The input is split through an
    // array, whose element order in memory is guaranteed, rather than a tuple,
    // whose layout is unspecified. The caller guarantees NEON and 32 writable
    // bytes behind `out`.
    unsafe {
        let [data_lo, data_hi]: [uint8x16_t; 2] = std::mem::transmute(data);
        let tables = uint8x16x2_t(data_lo, data_hi);
        let packed_lo = vqtbl2q_u8(tables, std::mem::transmute(idx_lo));
        let packed_hi = vqtbl2q_u8(tables, std::mem::transmute(idx_hi));
        vst1q_u8(out, packed_lo);
        vst1q_u8(out.add(16), packed_hi);
    }
}
/// Left-packs the `mask`-selected lanes of `data` with NEON table lookups and
/// returns the packed vector.
///
/// Selected lanes occupy the front of the result in their original order. The
/// remaining lanes are whatever the filler entries of
/// `COMPRESS_BYTE_IDX_U32X8` select.
///
/// # Safety
///
/// NEON must be available.
#[cfg(target_arch = "aarch64")]
#[inline]
#[target_feature(enable = "neon")]
unsafe fn compress_u32x8_neon_vec(data: u32x8, mask: u8) -> u32x8 {
    let (idx_lo, idx_hi) = COMPRESS_BYTE_IDX_U32X8[mask as usize];
    // SAFETY: every transmute is between equally sized plain-integer vector
    // types (size checked at compile time). The 256-bit value is split and
    // rebuilt through `[uint8x16_t; 2]`, whose element order in memory is
    // guaranteed, rather than through a tuple, whose layout is unspecified.
    // The caller guarantees NEON.
    unsafe {
        let [data_lo, data_hi]: [uint8x16_t; 2] = std::mem::transmute(data);
        let tables = uint8x16x2_t(data_lo, data_hi);
        let packed: [uint8x16_t; 2] = [
            vqtbl2q_u8(tables, std::mem::transmute(idx_lo)),
            vqtbl2q_u8(tables, std::mem::transmute(idx_hi)),
        ];
        std::mem::transmute::<[uint8x16_t; 2], u32x8>(packed)
    }
}
/// Scalar left-pack: copies each lane of `data` whose bit is set in `mask` to
/// the next free slot of `dest`, in lane order.
///
/// Writes exactly `mask.count_ones()` elements and leaves the rest of `dest`
/// untouched.
///
/// # Panics
///
/// Panics if `dest` is shorter than the number of selected lanes.
#[cfg(not(target_arch = "aarch64"))]
#[inline]
fn compress_store_u32x8_gather(data: u32x8, mask: u8, dest: &mut [u32]) {
    let lanes = data.to_array();
    let mut write_at = 0;
    for (lane, &value) in lanes.iter().enumerate() {
        if (mask >> lane) & 1 != 0 {
            dest[write_at] = value;
            write_at += 1;
        }
    }
}
/// Left-packs the lanes of `data` selected by `mask` into the front of `dest`.
///
/// Bit `i` of `mask` selects lane `i`; selected lanes keep their relative
/// order. Returns the number of selected lanes. Only `dest[..count]` is
/// meaningful afterwards.
///
/// Uses the AVX-512F compress-store when the CPU supports it, and otherwise
/// processes the vector as two 8-lane halves.
///
/// # Panics
///
/// Panics if `dest` holds fewer than 16 elements, whatever `mask` selects.
#[inline]
pub fn compress_store_u32x16(data: u32x16, mask: u16, dest: &mut [u32]) -> usize {
    let selected = mask.count_ones() as usize;
    assert!(dest.len() >= 16, "destination buffer must have room for 16 elements");

    #[cfg(target_arch = "x86_64")]
    {
        let has_avx512f = is_x86_feature_detected!("avx512f");
        if has_avx512f {
            // SAFETY: the required CPU feature was detected just above, and
            // `dest` has room for 16 elements, the most a u16 mask can select.
            unsafe { compress_store_u32x16_avx512(data, mask, dest) };
            return selected;
        }
    }

    compress_store_u32x16_fallback(data, mask, dest);
    selected
}
/// Left-packs the lanes of `data` selected by `mask` and returns the packed
/// vector together with the number of selected lanes.
///
/// Bit `i` of `mask` selects lane `i`; selected lanes keep their relative
/// order and occupy lanes `0..count` of the result.
#[inline]
pub fn compress_u32x16(data: u32x16, mask: u16) -> (u32x16, usize) {
    let selected = mask.count_ones() as usize;

    #[cfg(target_arch = "x86_64")]
    {
        let has_avx512f = is_x86_feature_detected!("avx512f");
        if has_avx512f {
            // SAFETY: the required CPU feature was detected just above.
            let packed = unsafe { compress_u32x16_avx512(data, mask) };
            return (packed, selected);
        }
    }

    (compress_u32x16_fallback_to_vec(data, mask), selected)
}
/// Stores the `mask`-selected lanes of `data` contiguously at the start of
/// `dest` with the AVX-512 compress-store instruction.
///
/// # Safety
///
/// The CPU must support `avx512f`. `dest` must have room for
/// `mask.count_ones()` elements: the store goes through a raw pointer and is
/// not bounds-checked.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn compress_store_u32x16_avx512(data: u32x16, mask: u16, dest: &mut [u32]) {
    let out = dest.as_mut_ptr().cast::<i32>();
    // SAFETY: `u32x16` and `__m512i` have the same size (checked by
    // `transmute` at compile time); the caller guarantees the CPU feature and
    // that `dest` can hold every selected lane.
    unsafe {
        let lanes: __m512i = std::mem::transmute(data);
        _mm512_mask_compressstoreu_epi32(out, mask, lanes);
    }
}
/// Left-packs the `mask`-selected lanes of `data` with the AVX-512 compress
/// instruction; the zero-masking form clears the lanes after the packed ones.
///
/// # Safety
///
/// The CPU must support `avx512f`.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn compress_u32x16_avx512(data: u32x16, mask: u16) -> u32x16 {
    // SAFETY: `u32x16` and `__m512i` have the same size (checked by
    // `transmute` at compile time); the caller guarantees the CPU feature.
    unsafe {
        let lanes: __m512i = std::mem::transmute(data);
        let packed = _mm512_maskz_compress_epi32(mask, lanes);
        std::mem::transmute(packed)
    }
}
/// Compress-stores a `u32x16` as two 8-lane halves.
///
/// The low half is packed at the start of `dest`; the high half is packed
/// immediately after the low half's selected lanes.
///
/// # Panics
///
/// Each half goes through `compress_store_u32x8`, which panics unless the
/// slice it receives holds at least 8 elements. At most 8 low lanes can be
/// selected, so a `dest` of 16 or more elements always satisfies both calls.
#[inline]
fn compress_store_u32x16_fallback(data: u32x16, mask: u16, dest: &mut [u32]) {
    // Little-endian byte order puts mask bits 0..8 first and bits 8..16 second.
    let [lo_mask, hi_mask] = mask.to_le_bytes();
    let (lo_half, hi_half) = data.split_low_high();

    let written = compress_store_u32x8(lo_half, lo_mask, dest);
    compress_store_u32x8(hi_half, hi_mask, &mut dest[written..]);
}
/// Left-packs a `u32x16` by compressing its two 8-lane halves separately and
/// concatenating the selected prefixes.
///
/// Lanes after the packed ones are zero, because the result is assembled in a
/// zero-initialised array.
#[inline]
fn compress_u32x16_fallback_to_vec(data: u32x16, mask: u16) -> u32x16 {
    // Little-endian byte order puts mask bits 0..8 first and bits 8..16 second.
    let [lo_mask, hi_mask] = mask.to_le_bytes();
    let (lo_half, hi_half) = data.split_low_high();

    let (lo_packed, lo_count) = compress_u32x8(lo_half, lo_mask);
    let (hi_packed, hi_count) = compress_u32x8(hi_half, hi_mask);
    let lo_lanes = lo_packed.to_array();
    let hi_lanes = hi_packed.to_array();

    let mut packed = [0u32; 16];
    let (head, tail) = packed.split_at_mut(lo_count);
    head.copy_from_slice(&lo_lanes[..lo_count]);
    // Never take more high lanes than the space left after the low ones.
    let hi_take = hi_count.min(tail.len());
    tail[..hi_take].copy_from_slice(&hi_lanes[..hi_take]);

    u32x16::from(packed)
}
/// Left-packs the bytes of `data` selected by `mask` into the front of `dest`.
///
/// Bit `i` of `mask` selects lane `i`; selected lanes keep their relative
/// order. Returns the number of selected lanes (the population count of
/// `mask`).
///
/// Only `dest[..count]` is meaningful afterwards. The NEON backend stores a
/// full 16-byte vector, so on aarch64 `dest[count..16]` is overwritten as
/// well.
///
/// # Panics
///
/// Panics if `dest` holds fewer than 16 elements, whatever `mask` selects.
#[inline]
pub fn compress_store_u8x16(data: u8x16, mask: u16, dest: &mut [u8]) -> usize {
    let selected = mask.count_ones() as usize;
    assert!(dest.len() >= 16, "destination buffer must have room for 16 elements");

    #[cfg(target_arch = "x86_64")]
    {
        let has_byte_compress =
            is_x86_feature_detected!("avx512vbmi2") && is_x86_feature_detected!("avx512vl");
        if has_byte_compress {
            // SAFETY: both required CPU features were detected just above, and
            // `dest` has room for 16 bytes, the most a u16 mask can select.
            unsafe { compress_store_u8x16_avx512(data, mask, dest) };
            return selected;
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: `dest` has room for the helper's unconditional 16-byte
        // store (asserted above). NEON availability is assumed rather than
        // detected at runtime here.
        unsafe { compress_store_u8x16_neon(data, mask, selected, dest) };
    }

    // Scalar path: x86_64 without AVX-512 VBMI2, and every other non-aarch64
    // target.
    #[cfg(not(target_arch = "aarch64"))]
    {
        compress_store_u8x16_gather(data, mask, dest);
    }

    selected
}
/// Left-packs the bytes of `data` selected by `mask` and returns the packed
/// vector together with the number of selected lanes.
///
/// Bit `i` of `mask` selects lane `i`; selected lanes keep their relative
/// order and occupy lanes `0..count` of the result. The contents of the
/// remaining lanes depend on the backend, so callers should rely only on the
/// first `count` lanes.
#[inline]
pub fn compress_u8x16(data: u8x16, mask: u16) -> (u8x16, usize) {
    let selected = mask.count_ones() as usize;

    #[cfg(target_arch = "x86_64")]
    {
        let has_byte_compress =
            is_x86_feature_detected!("avx512vbmi2") && is_x86_feature_detected!("avx512vl");
        if has_byte_compress {
            // SAFETY: both required CPU features were detected just above.
            let packed = unsafe { compress_u8x16_avx512(data, mask) };
            return (packed, selected);
        }
    }

    (compress_u8x16_shuffle(data, mask), selected)
}
/// Stores the `mask`-selected bytes of `data` contiguously at the start of
/// `dest` with the AVX-512 VBMI2 byte compress-store instruction.
///
/// # Safety
///
/// The CPU must support `avx512vbmi2` and `avx512vl`. `dest` must have room
/// for `mask.count_ones()` bytes: the store goes through a raw pointer and is
/// not bounds-checked.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512vbmi2", enable = "avx512vl")]
unsafe fn compress_store_u8x16_avx512(data: u8x16, mask: u16, dest: &mut [u8]) {
    let out = dest.as_mut_ptr().cast::<i8>();
    // SAFETY: `u8x16` and `__m128i` have the same size (checked by
    // `transmute` at compile time); the caller guarantees the CPU features
    // and that `dest` can hold every selected byte.
    unsafe {
        let lanes: __m128i = std::mem::transmute(data);
        _mm_mask_compressstoreu_epi8(out, mask, lanes);
    }
}
/// Left-packs the `mask`-selected bytes of `data` with the AVX-512 VBMI2
/// compress instruction; the zero-masking form clears the lanes after the
/// packed ones.
///
/// # Safety
///
/// The CPU must support `avx512vbmi2` and `avx512vl`.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512vbmi2", enable = "avx512vl")]
unsafe fn compress_u8x16_avx512(data: u8x16, mask: u16) -> u8x16 {
    // SAFETY: `u8x16` and `__m128i` have the same size (checked by
    // `transmute` at compile time); the caller guarantees the CPU features.
    unsafe {
        let lanes: __m128i = std::mem::transmute(data);
        let packed = _mm_maskz_compress_epi8(mask, lanes);
        std::mem::transmute(packed)
    }
}
/// Per-mask byte-shuffle indices used to left-pack a `u8x16` on aarch64.
///
/// Entry `[mask]` lists, in ascending order, the lane numbers whose bit is set
/// in `mask`; the entries after them are left at index 0, so a lookup with
/// this vector places lane 0 of the input in every unused output lane.
///
/// NOTE(review): the table has 65,536 entries of 16 bytes (1 MiB of static
/// data) and is built at compile time by roughly a million loop iterations.
/// Keep the loop body minimal if it is ever changed.
#[cfg(target_arch = "aarch64")]
static COMPRESS_BYTE_IDX_U8X16: [u8x16; 65536] = {
// Reinterprets 16 index bytes as a vector; usable in const context.
const fn arr_to_u8x16(arr: [u8; 16]) -> u8x16 {
// SAFETY: `transmute` checks at compile time that both types are 16 bytes.
unsafe { std::mem::transmute(arr) }
}
let mut table: [u8x16; 65536] = [arr_to_u8x16([0u8; 16]); 65536];
let mut mask = 0usize;
// `while` loops are used because `for` is not available in const evaluation.
while mask < 65536 {
let mut indices = [0u8; 16];
// Next output lane to fill.
let mut dest_pos = 0usize;
let mut src_pos = 0usize;
while src_pos < 16 {
// A selected source lane is appended to the packed index list.
if (mask >> src_pos) & 1 != 0 {
indices[dest_pos] = src_pos as u8;
dest_pos += 1;
}
src_pos += 1;
}
table[mask] = arr_to_u8x16(indices);
mask += 1;
}
table
};
/// Left-packs the `mask`-selected bytes of `data` with a NEON table lookup and
/// stores the full 16-byte result at the start of `dest`.
///
/// All 16 bytes are written regardless of how many lanes are selected, so
/// `dest[count..16]` is overwritten too. `_count` is accepted for signature
/// parity with the caller and is not used.
///
/// # Safety
///
/// NEON must be available and `dest` must hold at least 16 bytes; the store
/// goes through a raw pointer and is not bounds-checked.
#[cfg(target_arch = "aarch64")]
#[inline]
#[target_feature(enable = "neon")]
unsafe fn compress_store_u8x16_neon(data: u8x16, mask: u16, _count: usize, dest: &mut [u8]) {
    let order = COMPRESS_BYTE_IDX_U8X16[mask as usize];
    let out = dest.as_mut_ptr();
    // SAFETY: both transmutes are between 16-byte plain-integer vector types
    // (size checked at compile time); the caller guarantees NEON and 16
    // writable bytes behind `out`.
    unsafe {
        let source: uint8x16_t = std::mem::transmute(data);
        let picks: uint8x16_t = std::mem::transmute(order);
        vst1q_u8(out, vqtbl1q_u8(source, picks));
    }
}
/// Scalar left-pack: copies each byte of `data` whose bit is set in `mask` to
/// the next free slot of `dest`, in lane order.
///
/// Writes exactly `mask.count_ones()` bytes and leaves the rest of `dest`
/// untouched.
///
/// # Panics
///
/// Panics if `dest` is shorter than the number of selected lanes.
#[cfg(not(target_arch = "aarch64"))]
#[inline]
fn compress_store_u8x16_gather(data: u8x16, mask: u16, dest: &mut [u8]) {
    let lanes = data.to_array();
    let mut write_at = 0;
    for (lane, &byte) in lanes.iter().enumerate() {
        if (mask >> lane) & 1 != 0 {
            dest[write_at] = byte;
            write_at += 1;
        }
    }
}
/// Left-packs the bytes of `data` selected by `mask` with one byte shuffle.
///
/// The shuffle control is assembled from two per-byte lookup tables: the
/// entries for the low mask byte come first, followed by the entries for the
/// high mask byte. Unused control lanes stay 0.
///
/// NOTE(review): this presumes `SHUFFLE_COMPRESS_IDX_U8_HI` already holds lane
/// numbers in `8..16`; confirm against `wide_utils`.
#[inline]
fn compress_u8x16_shuffle(data: u8x16, mask: u16) -> u8x16 {
    // Little-endian byte order puts mask bits 0..8 first and bits 8..16 second.
    let [lo_mask, hi_mask] = mask.to_le_bytes();
    let lo_count = lo_mask.count_ones() as usize;
    let hi_count = hi_mask.count_ones() as usize;

    let lo_picks = &SHUFFLE_COMPRESS_IDX_U8_LO[usize::from(lo_mask)];
    let hi_picks = &SHUFFLE_COMPRESS_IDX_U8_HI[usize::from(hi_mask)];

    let mut control = [0u8; 16];
    let (head, tail) = control.split_at_mut(lo_count);
    head.copy_from_slice(&lo_picks[..lo_count]);
    // Never take more high entries than the space left after the low ones.
    let hi_take = hi_count.min(tail.len());
    tail[..hi_take].copy_from_slice(&hi_picks[..hi_take]);

    data.shuffle(u8x16::from(control))
}
#[cfg(test)]
mod tests {
    //! Checks for the compress helpers: each test verifies the selected-lane
    //! count and the packed prefix of the output. Lanes past the count are
    //! deliberately not inspected.

    use super::*;

    /// Eight-lane fixture: 10, 20, ..., 80.
    const TENS_X8: [u32; 8] = [10, 20, 30, 40, 50, 60, 70, 80];
    /// Eight-lane fixture: 1, 2, ..., 8.
    const ONE_TO_EIGHT: [u32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
    /// Sixteen-lane fixture: 10, 20, ..., 160.
    const TENS_X16: [u32; 16] = [
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160,
    ];
    /// Sixteen-lane fixture: 1, 2, ..., 16.
    const ONE_TO_SIXTEEN: [u32; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
    /// Byte fixture whose value equals its lane number.
    const LANE_IDS: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    /// Byte fixture: 10, 20, ..., 160.
    const TENS_BYTES: [u8; 16] = [
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160,
    ];

    #[test]
    fn test_compress_u32x8_basic() {
        let mut out = [0u32; 8];
        let n = compress_store_u32x8(u32x8::from(TENS_X8), 0b1011_0010u8, &mut out);
        assert_eq!(n, 4);
        assert_eq!(out[..4], [20, 50, 60, 80]);
    }

    #[test]
    fn test_compress_u32x8_all() {
        let mut out = [0u32; 8];
        let n = compress_store_u32x8(u32x8::from(ONE_TO_EIGHT), 0xFFu8, &mut out);
        assert_eq!(n, 8);
        assert_eq!(out, [1, 2, 3, 4, 5, 6, 7, 8]);
    }

    #[test]
    fn test_compress_u32x8_none() {
        let mut out = [0u32; 8];
        let n = compress_store_u32x8(u32x8::from(ONE_TO_EIGHT), 0x00u8, &mut out);
        assert_eq!(n, 0);
    }

    #[test]
    fn test_compress_u32x8_first_only() {
        let mut out = [0u32; 8];
        let input = u32x8::from([42, 2, 3, 4, 5, 6, 7, 8]);
        let n = compress_store_u32x8(input, 0b0000_0001u8, &mut out);
        assert_eq!(n, 1);
        assert_eq!(out[0], 42);
    }

    #[test]
    fn test_compress_u32x8_last_only() {
        let mut out = [0u32; 8];
        let input = u32x8::from([1, 2, 3, 4, 5, 6, 7, 99]);
        let n = compress_store_u32x8(input, 0b1000_0000u8, &mut out);
        assert_eq!(n, 1);
        assert_eq!(out[0], 99);
    }

    #[test]
    fn test_compress_u8x16_basic() {
        let mut out = [0u8; 16];
        let n = compress_store_u8x16(u8x16::from(LANE_IDS), 0b1000_0001_0000_0101u16, &mut out);
        assert_eq!(n, 4);
        assert_eq!(out[..4], [0, 2, 8, 15]);
    }

    #[test]
    fn test_compress_u8x16_all() {
        let mut out = [0u8; 16];
        let n = compress_store_u8x16(u8x16::from(LANE_IDS), 0xFFFFu16, &mut out);
        assert_eq!(n, 16);
        assert_eq!(out, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
    }

    #[test]
    fn test_compress_u8x16_none() {
        let mut out = [0u8; 16];
        let n = compress_store_u8x16(u8x16::from(LANE_IDS), 0x0000u16, &mut out);
        assert_eq!(n, 0);
    }

    #[test]
    fn test_compress_u8x16_low_half_only() {
        let mut out = [0u8; 16];
        let n = compress_store_u8x16(u8x16::from(TENS_BYTES), 0b0000_0000_1010_1010u16, &mut out);
        assert_eq!(n, 4);
        assert_eq!(out[..4], [20, 40, 60, 80]);
    }

    #[test]
    fn test_compress_u8x16_high_half_only() {
        let mut out = [0u8; 16];
        let n = compress_store_u8x16(u8x16::from(TENS_BYTES), 0b0101_0101_0000_0000u16, &mut out);
        assert_eq!(n, 4);
        assert_eq!(out[..4], [90, 110, 130, 150]);
    }

    #[test]
    fn test_compress_u32x8_return_vector() {
        let (packed, n) = compress_u32x8(u32x8::from(TENS_X8), 0b1011_0010u8);
        let lanes = packed.to_array();
        assert_eq!(n, 4);
        assert_eq!(lanes[..4], [20, 50, 60, 80]);
    }

    #[test]
    fn test_compress_u8x16_return_vector() {
        let (packed, n) = compress_u8x16(u8x16::from(LANE_IDS), 0b1000_0001_0000_0101u16);
        let lanes = packed.to_array();
        assert_eq!(n, 4);
        assert_eq!(lanes[..4], [0, 2, 8, 15]);
    }

    #[test]
    fn test_compress_u32x16_basic() {
        let mut out = [0u32; 16];
        let n = compress_store_u32x16(u32x16::from(TENS_X16), 0b1000_0001_1011_0010u16, &mut out);
        assert_eq!(n, 6);
        assert_eq!(out[..6], [20, 50, 60, 80, 90, 160]);
    }

    #[test]
    fn test_compress_u32x16_all() {
        let mut out = [0u32; 16];
        let n = compress_store_u32x16(u32x16::from(ONE_TO_SIXTEEN), 0xFFFFu16, &mut out);
        assert_eq!(n, 16);
        assert_eq!(out, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
    }

    #[test]
    fn test_compress_u32x16_none() {
        let mut out = [0u32; 16];
        let n = compress_store_u32x16(u32x16::from(ONE_TO_SIXTEEN), 0x0000u16, &mut out);
        assert_eq!(n, 0);
    }

    #[test]
    fn test_compress_u32x16_low_half_only() {
        let mut out = [0u32; 16];
        let n = compress_store_u32x16(u32x16::from(TENS_X16), 0b0000_0000_0101_0101u16, &mut out);
        assert_eq!(n, 4);
        assert_eq!(out[..4], [10, 30, 50, 70]);
    }

    #[test]
    fn test_compress_u32x16_high_half_only() {
        let mut out = [0u32; 16];
        let n = compress_store_u32x16(u32x16::from(TENS_X16), 0b0101_0101_0000_0000u16, &mut out);
        assert_eq!(n, 4);
        assert_eq!(out[..4], [90, 110, 130, 150]);
    }

    #[test]
    fn test_compress_u32x16_return_vector() {
        let (packed, n) = compress_u32x16(u32x16::from(TENS_X16), 0b1000_0001_1011_0010u16);
        let lanes = packed.to_array();
        assert_eq!(n, 6);
        assert_eq!(lanes[..6], [20, 50, 60, 80, 90, 160]);
    }

    #[test]
    fn test_compress_u32x16_first_and_last() {
        let mut input = [0u32; 16];
        input[0] = 100;
        input[15] = 200;
        let mut out = [0u32; 16];
        let n = compress_store_u32x16(u32x16::from(input), 0b1000_0000_0000_0001u16, &mut out);
        assert_eq!(n, 2);
        assert_eq!(out[..2], [100, 200]);
    }

    #[test]
    #[should_panic(expected = "destination buffer must have room for 8 elements")]
    fn test_compress_u32x8_panics_on_small_buffer() {
        // Four slots would fit the four selected lanes, but the function
        // requires room for a full vector.
        let mut too_small = [0u32; 4];
        compress_store_u32x8(u32x8::from(TENS_X8), 0b1011_0010u8, &mut too_small);
    }

    #[test]
    #[should_panic(expected = "destination buffer must have room for 16 elements")]
    fn test_compress_u8x16_panics_on_small_buffer() {
        let mut too_small = [0u8; 8];
        compress_store_u8x16(u8x16::from(LANE_IDS), 0b1000_0001_0000_0101u16, &mut too_small);
    }

    #[test]
    #[should_panic(expected = "destination buffer must have room for 16 elements")]
    fn test_compress_u32x16_panics_on_small_buffer() {
        let mut too_small = [0u32; 8];
        compress_store_u32x16(u32x16::from(TENS_X16), 0b1000_0001_1011_0010u16, &mut too_small);
    }
}