simd_lookup/
simd_compress.rs

1//! SIMD compress operations
2//!
3//! This module provides compress/compact operations similar to AVX-512's VCOMPRESS instruction.
4//! Elements where the corresponding mask bit is set are packed contiguously to the front of
5//! the destination buffer.
6//!
7//! # CPU Feature Requirements
8//!
9//! ## Intel x86_64 - Optimal Performance (AVX-512)
10//!
11//! - **`compress_store_u32x8` / `compress_u32x8`**: Requires **AVX512F** + **AVX512VL**
12//!   - Uses `VPCOMPRESSD` instruction (`_mm256_mask_compressstoreu_epi32`)
13//!   - Available on: Intel Skylake-X (Xeon), Ice Lake, Tiger Lake, and later
//!   - Fallback: shuffle-based table lookup for `compress_u32x8`, gather-style
//!     indexed writes for `compress_store_u32x8` (both work on all architectures)
15//!
16//! - **`compress_store_u32x16` / `compress_u32x16`**: Requires **AVX512F**
17//!   - Uses `VPCOMPRESSD` instruction (`_mm512_mask_compressstoreu_epi32`)
18//!   - Available on: Intel Skylake-X (Xeon), Ice Lake, Tiger Lake, and later
19//!   - Fallback: Two `compress_store_u32x8` operations (works on all architectures)
20//!
21//! - **`compress_store_u8x16` / `compress_u8x16`**: Requires **AVX512VBMI2** + **AVX512VL**
22//!   - Uses `VPCOMPRESSB` instruction (`_mm256_mask_compressstoreu_epi8`)
23//!   - Available on: Intel Ice Lake, Tiger Lake, and later (not available on Skylake-X)
24//!   - Fallback: NEON TBL shuffle on ARM, gather-style writes elsewhere
25//!
26//! ## ARM aarch64 - NEON Optimizations (Apple Silicon M1/M2/M3)
27//!
28//! On ARM processors, this module uses NEON-optimized implementations:
29//!
30//! - **`compress_store_u8x16`**: Uses NEON `TBL` instruction via shuffle + copy
31//!   - Eliminates 16 conditional branches from the scalar fallback
32//!   - Uses precomputed shuffle index tables for O(1) index lookup
33//!
34//! - **`compress_store_u32x8`**: Uses NEON `TBL` with byte-level shuffle indices
35//!   - Uses `vqtbl1q_u8` for efficient 16-byte permutation (processes as 2 halves)
36//!   - Precomputed byte-index table avoids runtime index conversion overhead
37//!
38//! - **Bitmask expansion**: Uses NEON parallel bit operations
39//!   - Converts bitmask to vector mask without scalar loops
40//!
41//! ## Fallback Behavior
42//!
43//! All functions automatically fall back to scalar/shuffle implementations when
44//! architecture-specific features are not available:
45//! - x86_64 without AVX-512 (uses AVX2/SSE if available, or scalar)
46//! - aarch64 without NEON (rare, uses scalar)
47//! - All other architectures (scalar fallback)
48//!
49//! ## Performance Impact
50//!
51//! - AVX-512 compress instructions are **3-5× faster** than shuffle-based fallbacks
52//! - ARM NEON shuffle-based compress is **~2× faster** than scalar conditional branches
53//!   for typical mask densities (10-50% of elements selected)
54//!
55//! # Example
56//! ```ignore
57//! use wide::u32x8;
58//! use simd_lookup::simd_compress::compress_store_u32x8;
59//!
60//! let data = u32x8::from([10, 20, 30, 40, 50, 60, 70, 80]);
61//! let mask = 0b10110010u8; // Select elements at positions 1, 4, 5, 7
62//! let mut output = [0u32; 8];
63//!
64//! let count = compress_store_u32x8(data, mask, &mut output);
65//! // count == 4
66//! // output[0..4] == [20, 50, 60, 80]
67//! ```
68
69use wide::{u8x16, u32x8, u32x16};
70
71#[cfg(target_arch = "x86_64")]
72use std::arch::x86_64::*;
73
74#[cfg(target_arch = "x86_64")]
75use std::arch::is_x86_feature_detected;
76
77#[cfg(target_arch = "aarch64")]
78use std::arch::aarch64::*;
79
80use crate::wide_utils::{
81    SimdSplit, WideUtilsExt,
82    SHUFFLE_COMPRESS_IDX_U8_HI, SHUFFLE_COMPRESS_IDX_U8_LO,
83};
84
85#[cfg(not(target_arch = "aarch64"))]
86use crate::wide_utils::get_compress_indices_u32x8;
87
88// =============================================================================
89// u32x8 Compress Operations
90// =============================================================================
91
/// Compress and store u32x8 elements where mask bits are set.
///
/// # Arguments
/// * `data` - Source vector of 8 u32 values
/// * `mask` - 8-bit mask where bit i selects element i
/// * `dest` - Destination slice (**must have room for 8 elements**)
///
/// # Returns
/// Number of elements written (equal to `mask.count_ones()`)
///
/// # Panics
/// Panics if `dest.len() < 8`. The destination must have room for the full
/// uncompressed vector since the mask is not known at compile time.
///
/// Note: the aarch64 NEON path always stores a full 8-element (32-byte) run
/// into `dest`; only the first `count` elements are meaningful afterwards.
#[inline]
pub fn compress_store_u32x8(data: u32x8, mask: u8, dest: &mut [u32]) -> usize {
    let count = mask.count_ones() as usize;
    assert!(dest.len() >= 8, "destination buffer must have room for 8 elements");

    #[cfg(target_arch = "x86_64")]
    {
        // Runtime dispatch: prefer the native VPCOMPRESSD path when available.
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            unsafe { compress_store_u32x8_avx512(data, mask, dest) };
            return count;
        }
        // Fallback for x86-64 without AVX-512
        compress_store_u32x8_gather(data, mask, dest);
        return count;
    }

    #[cfg(target_arch = "aarch64")]
    {
        // Use NEON TBL-based shuffle - faster than conditional branches
        unsafe { compress_store_u32x8_neon(data, mask, count, dest) };
        return count;
    }

    // Compiled only on architectures other than x86_64/aarch64; the cfg blocks
    // above each return, so on those targets this is the sole remaining path.
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        compress_store_u32x8_gather(data, mask, dest);
        count
    }
}
135
/// Compress u32x8 and return both the compressed vector and element count.
///
/// Only the first `count` lanes of the returned vector are meaningful;
/// the remaining (unwritten) lanes contain undefined values.
#[inline]
pub fn compress_u32x8(data: u32x8, mask: u8) -> (u32x8, usize) {
    let count = mask.count_ones() as usize;

    #[cfg(target_arch = "x86_64")]
    {
        // Native VPCOMPRESSD path when AVX512F + AVX512VL are detected at runtime.
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            let result = unsafe { compress_u32x8_avx512(data, mask) };
            return (result, count);
        }
        // Fallback for x86-64 without AVX-512: shuffle with precomputed indices.
        let indices = get_compress_indices_u32x8(mask);
        let result = data.shuffle(indices);
        return (result, count);
    }

    #[cfg(target_arch = "aarch64")]
    {
        // Use NEON TBL-based shuffle with byte-level indices
        let result = unsafe { compress_u32x8_neon_vec(data, mask) };
        return (result, count);
    }

    // Fallback for all other architectures: shuffle with SIMD indices
    // (zero-cost table lookup via transmute). The cfg blocks above each
    // return, so on those targets this is the sole remaining path.
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        let indices = get_compress_indices_u32x8(mask);
        let result = data.shuffle(indices);
        (result, count)
    }
}
169
/// AVX-512 store path: single VPCOMPRESSD via `_mm256_mask_compressstoreu_epi32`.
///
/// # Safety
/// Caller must ensure AVX512F and AVX512VL are supported at runtime, and that
/// `dest` can hold `mask.count_ones()` u32 values (the public wrapper asserts
/// room for all 8 before dispatching here).
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f", enable = "avx512vl")]
unsafe fn compress_store_u32x8_avx512(data: u32x8, mask: u8, dest: &mut [u32]) {
    unsafe {
        // SAFETY: u32x8 and __m256i are both 32 bytes and any bit pattern is valid.
        let raw = std::mem::transmute::<u32x8, __m256i>(data);
        // Writes exactly mask.count_ones() elements contiguously at dest.
        _mm256_mask_compressstoreu_epi32(dest.as_mut_ptr() as *mut i32, mask, raw);
    }
}
179
/// AVX-512 vector path: zero-masking compress (`_mm256_maskz_compress_epi32`);
/// lanes past the packed run are zeroed by the instruction itself.
///
/// # Safety
/// Caller must ensure AVX512F and AVX512VL are supported at runtime.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f", enable = "avx512vl")]
unsafe fn compress_u32x8_avx512(data: u32x8, mask: u8) -> u32x8 {
    unsafe {
        // SAFETY: u32x8 and __m256i are both 32 bytes and any bit pattern is valid.
        let raw = std::mem::transmute::<u32x8, __m256i>(data);
        let compressed = _mm256_maskz_compress_epi32(mask, raw);
        std::mem::transmute::<__m256i, u32x8>(compressed)
    }
}
190
191// =============================================================================
192// ARM NEON u32x8 Compress Implementations
193// =============================================================================
194
/// Byte-level shuffle indices for NEON TBL-based u32x8 compress.
/// Stored as (u8x16, u8x16) pairs for proper alignment and zero-cost transmute.
/// Each entry contains byte indices (0-31) that shuffle selected u32 elements to the front.
#[cfg(target_arch = "aarch64")]
static COMPRESS_BYTE_IDX_U32X8: [(u8x16, u8x16); 256] = {
    // Safety: u8x16 is repr(transparent) over [u8; 16], so transmute is safe
    const fn arr_to_u8x16(arr: [u8; 16]) -> u8x16 {
        unsafe { std::mem::transmute(arr) }
    }

    let zero = arr_to_u8x16([0u8; 16]);
    let mut out: [(u8x16, u8x16); 256] = [(zero, zero); 256];
    let mut m = 0usize;
    while m < 256 {
        // Build the 32-byte index pattern in one flat buffer, then split it
        // into the low/high 16-byte halves consumed by the two TBL2 lookups.
        let mut bytes = [0u8; 32];
        let mut packed = 0usize;
        let mut lane = 0usize;
        while lane < 8 {
            if (m >> lane) & 1 == 1 {
                // Each u32 lane contributes 4 consecutive byte indices.
                let mut b = 0usize;
                while b < 4 {
                    bytes[packed * 4 + b] = (lane * 4 + b) as u8;
                    b += 1;
                }
                packed += 1;
            }
            lane += 1;
        }
        let mut lo = [0u8; 16];
        let mut hi = [0u8; 16];
        let mut i = 0usize;
        while i < 16 {
            lo[i] = bytes[i];
            hi[i] = bytes[i + 16];
            i += 1;
        }
        out[m] = (arr_to_u8x16(lo), arr_to_u8x16(hi));
        m += 1;
    }
    out
};
238
/// NEON-optimized compress store for u32x8 using TBL instruction.
/// Writes full 32 bytes directly; only first `count` elements are valid.
/// Caller must ensure dest has room for 8 elements.
///
/// # Safety
/// Requires NEON support (baseline on aarch64) and `dest.len() >= 8`; the
/// public wrapper asserts the latter before dispatching here.
#[cfg(target_arch = "aarch64")]
#[inline]
#[target_feature(enable = "neon")]
unsafe fn compress_store_u32x8_neon(data: u32x8, mask: u8, _count: usize, dest: &mut [u32]) {
    unsafe {
        // Precomputed per-mask byte-shuffle pattern (one table read, no branching).
        let (idx_lo, idx_hi) = COMPRESS_BYTE_IDX_U32X8[mask as usize];

        // Zero-cost transmute u32x8 to (u8x16, u8x16) for TBL2
        let (data_lo, data_hi): (u8x16, u8x16) = std::mem::transmute(data);
        let tables = uint8x16x2_t(std::mem::transmute(data_lo), std::mem::transmute(data_hi));

        // TBL2 shuffle both halves
        let result_lo = vqtbl2q_u8(tables, std::mem::transmute(idx_lo));
        let result_hi = vqtbl2q_u8(tables, std::mem::transmute(idx_hi));

        // Store full 32 bytes directly
        let dest_ptr = dest.as_mut_ptr() as *mut u8;
        vst1q_u8(dest_ptr, result_lo);
        vst1q_u8(dest_ptr.add(16), result_hi);
    }
}
263
/// NEON-optimized compress for u32x8 returning a vector.
///
/// Only the first `mask.count_ones()` lanes of the result are meaningful;
/// trailing lanes hold filler (the index table defaults unselected slots to
/// byte 0, so they replicate the first source byte pattern).
///
/// # Safety
/// Requires NEON support (baseline on aarch64).
#[cfg(target_arch = "aarch64")]
#[inline]
#[target_feature(enable = "neon")]
unsafe fn compress_u32x8_neon_vec(data: u32x8, mask: u8) -> u32x8 {
    unsafe {
        // Precomputed per-mask byte-shuffle pattern (one table read, no branching).
        let (idx_lo, idx_hi) = COMPRESS_BYTE_IDX_U32X8[mask as usize];

        // Zero-cost transmute u32x8 to (u8x16, u8x16) for TBL2
        let (data_lo, data_hi): (u8x16, u8x16) = std::mem::transmute(data);
        let tables = uint8x16x2_t(std::mem::transmute(data_lo), std::mem::transmute(data_hi));

        // Shuffle both halves
        let result_lo = vqtbl2q_u8(tables, std::mem::transmute(idx_lo));
        let result_hi = vqtbl2q_u8(tables, std::mem::transmute(idx_hi));

        // Transmute results directly to u8x16, then combine into u32x8
        let lo: u8x16 = std::mem::transmute(result_lo);
        let hi: u8x16 = std::mem::transmute(result_hi);

        std::mem::transmute((lo, hi))
    }
}
287
288/// Gather-style compress for u32x8 - direct indexed writes to destination.
289/// Used as fallback on x86-64 without AVX-512 and other non-ARM architectures.
290#[cfg(not(target_arch = "aarch64"))]
291#[inline]
292fn compress_store_u32x8_gather(data: u32x8, mask: u8, dest: &mut [u32]) {
293    let arr = data.to_array();
294    let mut idx = 0;
295    if mask & (1 << 0) != 0 { dest[idx] = arr[0]; idx += 1; }
296    if mask & (1 << 1) != 0 { dest[idx] = arr[1]; idx += 1; }
297    if mask & (1 << 2) != 0 { dest[idx] = arr[2]; idx += 1; }
298    if mask & (1 << 3) != 0 { dest[idx] = arr[3]; idx += 1; }
299    if mask & (1 << 4) != 0 { dest[idx] = arr[4]; idx += 1; }
300    if mask & (1 << 5) != 0 { dest[idx] = arr[5]; idx += 1; }
301    if mask & (1 << 6) != 0 { dest[idx] = arr[6]; idx += 1; }
302    if mask & (1 << 7) != 0 { dest[idx] = arr[7]; }
303}
304
305// =============================================================================
306// u32x16 Compress Operations (512-bit)
307// =============================================================================
308
/// Compress and store u32x16 elements where mask bits are set.
///
/// # Arguments
/// * `data` - Source vector of 16 u32 values
/// * `mask` - 16-bit mask where bit i selects element i
/// * `dest` - Destination slice (**must have room for 16 elements**)
///
/// # Returns
/// Number of elements written (equal to `mask.count_ones()`)
///
/// # Panics
/// Panics if `dest.len() < 16`. The destination must have room for the full
/// uncompressed vector since the mask is not known at compile time.
#[inline]
pub fn compress_store_u32x16(data: u32x16, mask: u16, dest: &mut [u32]) -> usize {
    let count = mask.count_ones() as usize;
    assert!(dest.len() >= 16, "destination buffer must have room for 16 elements");

    #[cfg(target_arch = "x86_64")]
    {
        // The 512-bit VPCOMPRESSD form needs only AVX512F (no VL required).
        if is_x86_feature_detected!("avx512f") {
            unsafe { compress_store_u32x16_avx512(data, mask, dest) };
            return count;
        }
    }

    // Fallback: split into two u32x8 halves and compress each
    compress_store_u32x16_fallback(data, mask, dest);
    count
}
339
/// Compress u32x16 and return both the compressed vector and element count.
///
/// Only the first `count` lanes of the returned vector are meaningful;
/// the remaining (unwritten) lanes contain undefined values.
#[inline]
pub fn compress_u32x16(data: u32x16, mask: u16) -> (u32x16, usize) {
    let count = mask.count_ones() as usize;

    #[cfg(target_arch = "x86_64")]
    {
        // The 512-bit compress form needs only AVX512F (no VL required).
        if is_x86_feature_detected!("avx512f") {
            let result = unsafe { compress_u32x16_avx512(data, mask) };
            return (result, count);
        }
    }

    // Fallback: use two u32x8 compress operations
    let result = compress_u32x16_fallback_to_vec(data, mask);
    (result, count)
}
358
/// AVX-512 store path: one 512-bit VPCOMPRESSD (`_mm512_mask_compressstoreu_epi32`).
///
/// # Safety
/// Caller must ensure AVX512F is supported at runtime, and that `dest` can hold
/// `mask.count_ones()` u32 values (the public wrapper asserts room for all 16).
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn compress_store_u32x16_avx512(data: u32x16, mask: u16, dest: &mut [u32]) {
    unsafe {
        // SAFETY: u32x16 and __m512i are both 64 bytes and any bit pattern is valid.
        let raw = std::mem::transmute::<u32x16, __m512i>(data);
        _mm512_mask_compressstoreu_epi32(dest.as_mut_ptr() as *mut i32, mask, raw);
    }
}
368
/// AVX-512 vector path: zero-masking compress (`_mm512_maskz_compress_epi32`);
/// lanes past the packed run are zeroed by the instruction itself.
///
/// # Safety
/// Caller must ensure AVX512F is supported at runtime.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn compress_u32x16_avx512(data: u32x16, mask: u16) -> u32x16 {
    unsafe {
        // SAFETY: u32x16 and __m512i are both 64 bytes and any bit pattern is valid.
        let raw = std::mem::transmute::<u32x16, __m512i>(data);
        let compressed = _mm512_maskz_compress_epi32(mask, raw);
        std::mem::transmute::<__m512i, u32x16>(compressed)
    }
}
379
380/// Fallback: compress u32x16 by splitting into two u32x8 halves using SimdSplit
381#[inline]
382fn compress_store_u32x16_fallback(data: u32x16, mask: u16, dest: &mut [u32]) {
383    // Use efficient SimdSplit to extract halves
384    let (lo, hi) = data.split_low_high();
385
386    let lo_mask = (mask & 0xFF) as u8;
387    let hi_mask = ((mask >> 8) & 0xFF) as u8;
388
389    // Compress low half
390    let lo_count = compress_store_u32x8(lo, lo_mask, dest);
391
392    // Compress high half, writing after the low results
393    let _ = compress_store_u32x8(hi, hi_mask, &mut dest[lo_count..]);
394}
395
396/// Fallback: compress u32x16 to vector using two u32x8 operations
397#[inline]
398fn compress_u32x16_fallback_to_vec(data: u32x16, mask: u16) -> u32x16 {
399    // Use efficient SimdSplit to extract halves
400    let (lo, hi) = data.split_low_high();
401
402    let lo_mask = (mask & 0xFF) as u8;
403    let hi_mask = ((mask >> 8) & 0xFF) as u8;
404
405    // Compress each half
406    let (lo_compressed, lo_count) = compress_u32x8(lo, lo_mask);
407    let (hi_compressed, hi_count) = compress_u32x8(hi, hi_mask);
408
409    // Combine results using slice operations
410    let lo_arr = lo_compressed.to_array();
411    let hi_arr = hi_compressed.to_array();
412
413    let mut result = [0u32; 16];
414
415    // Copy compressed low elements
416    result[..lo_count].copy_from_slice(&lo_arr[..lo_count]);
417
418    // Copy compressed high elements after low
419    let hi_copy_count = hi_count.min(16 - lo_count);
420    result[lo_count..lo_count + hi_copy_count].copy_from_slice(&hi_arr[..hi_copy_count]);
421
422    u32x16::from(result)
423}
424
425// =============================================================================
426// u8x16 Compress Operations
427// =============================================================================
428
/// Compress and store u8x16 elements where mask bits are set.
///
/// # Arguments
/// * `data` - Source vector of 16 u8 values
/// * `mask` - 16-bit mask where bit i selects element i
/// * `dest` - Destination slice (**must have room for 16 elements**)
///
/// # Returns
/// Number of elements written (equal to `mask.count_ones()`)
///
/// # Panics
/// Panics if `dest.len() < 16`. The destination must have room for the full
/// uncompressed vector since the mask is not known at compile time.
///
/// Note: the aarch64 NEON path always stores a full 16-byte vector into
/// `dest`; only the first `count` bytes are meaningful afterwards.
#[inline]
pub fn compress_store_u8x16(data: u8x16, mask: u16, dest: &mut [u8]) -> usize {
    let count = mask.count_ones() as usize;
    assert!(dest.len() >= 16, "destination buffer must have room for 16 elements");

    #[cfg(target_arch = "x86_64")]
    {
        // AVX512VBMI2 has native u8 compress (VPCOMPRESSB, Ice Lake and later)
        if is_x86_feature_detected!("avx512vbmi2") && is_x86_feature_detected!("avx512vl") {
            unsafe { compress_store_u8x16_avx512(data, mask, dest) };
            return count;
        }
        // Fallback for x86-64 without AVX-512VBMI2
        compress_store_u8x16_gather(data, mask, dest);
        return count;
    }

    #[cfg(target_arch = "aarch64")]
    {
        // Use NEON TBL shuffle - eliminates 16 conditional branches
        unsafe { compress_store_u8x16_neon(data, mask, count, dest) };
        return count;
    }

    // Compiled only on architectures other than x86_64/aarch64; the cfg blocks
    // above each return, so on those targets this is the sole remaining path.
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        compress_store_u8x16_gather(data, mask, dest);
        count
    }
}
473
/// Compress u8x16 and return both the compressed vector and element count.
///
/// Only the first `count` lanes of the returned vector are meaningful;
/// the remaining (unwritten) lanes contain undefined values.
#[inline]
pub fn compress_u8x16(data: u8x16, mask: u16) -> (u8x16, usize) {
    let count = mask.count_ones() as usize;

    #[cfg(target_arch = "x86_64")]
    {
        // Native VPCOMPRESSB path requires AVX512VBMI2 + AVX512VL at runtime.
        if is_x86_feature_detected!("avx512vbmi2") && is_x86_feature_detected!("avx512vl") {
            let result = unsafe { compress_u8x16_avx512(data, mask) };
            return (result, count);
        }
    }

    // Fallback: two-pass shuffle approach (used on aarch64 and all other paths)
    let result = compress_u8x16_shuffle(data, mask);
    (result, count)
}
492
/// AVX-512 store path: VPCOMPRESSB via `_mm_mask_compressstoreu_epi8`.
///
/// # Safety
/// Caller must ensure AVX512VBMI2 and AVX512VL are supported at runtime, and
/// that `dest` can hold `mask.count_ones()` bytes (the public wrapper asserts
/// room for all 16 before dispatching here).
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512vbmi2", enable = "avx512vl")]
unsafe fn compress_store_u8x16_avx512(data: u8x16, mask: u16, dest: &mut [u8]) {
    unsafe {
        // SAFETY: u8x16 and __m128i are both 16 bytes and any bit pattern is valid.
        let raw = std::mem::transmute::<u8x16, __m128i>(data);
        _mm_mask_compressstoreu_epi8(dest.as_mut_ptr() as *mut i8, mask, raw);
    }
}
502
/// AVX-512 vector path: zero-masking byte compress (`_mm_maskz_compress_epi8`);
/// lanes past the packed run are zeroed by the instruction itself.
///
/// # Safety
/// Caller must ensure AVX512VBMI2 and AVX512VL are supported at runtime.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512vbmi2", enable = "avx512vl")]
unsafe fn compress_u8x16_avx512(data: u8x16, mask: u16) -> u8x16 {
    unsafe {
        // SAFETY: u8x16 and __m128i are both 16 bytes and any bit pattern is valid.
        let raw = std::mem::transmute::<u8x16, __m128i>(data);
        let compressed = _mm_maskz_compress_epi8(mask, raw);
        std::mem::transmute::<__m128i, u8x16>(compressed)
    }
}
513
514// =============================================================================
515// ARM NEON u8x16 Compress Implementations
516// =============================================================================
517
/// Precomputed 16-byte shuffle indices for NEON TBL-based u8x16 compress.
/// Stored as u8x16 for proper alignment and zero-cost transmute to NEON registers.
/// Each entry contains byte indices that shuffle selected bytes to the front.
///
/// NOTE: 65536 entries x 16 bytes = 1 MiB of static data. This trades memory
/// (and potential cold-cache misses) for a single branch-free lookup per call.
#[cfg(target_arch = "aarch64")]
static COMPRESS_BYTE_IDX_U8X16: [u8x16; 65536] = {
    // Safety: u8x16 is repr(transparent) over [u8; 16], so transmute is safe
    const fn arr_to_u8x16(arr: [u8; 16]) -> u8x16 {
        unsafe { std::mem::transmute(arr) }
    }

    let mut table: [u8x16; 65536] = [arr_to_u8x16([0u8; 16]); 65536];
    let mut mask = 0usize;
    while mask < 65536 {
        let mut indices = [0u8; 16];
        let mut dest_pos = 0usize;
        let mut src_pos = 0usize;
        // For each set bit, record its source lane at the next packed slot.
        while src_pos < 16 {
            if (mask >> src_pos) & 1 != 0 {
                indices[dest_pos] = src_pos as u8;
                dest_pos += 1;
            }
            src_pos += 1;
        }
        // Fill remaining with 0 (safe filler)
        table[mask] = arr_to_u8x16(indices);
        mask += 1;
    }
    table
};
547
/// NEON-optimized compress store for u8x16 using TBL instruction.
/// Writes full 16 bytes directly; only first `count` elements are valid.
/// Caller must ensure dest has room for 16 elements.
///
/// # Safety
/// Requires NEON support (baseline on aarch64) and `dest.len() >= 16`; the
/// public wrapper asserts the latter before dispatching here.
#[cfg(target_arch = "aarch64")]
#[inline]
#[target_feature(enable = "neon")]
unsafe fn compress_store_u8x16_neon(data: u8x16, mask: u16, _count: usize, dest: &mut [u8]) {
    unsafe {
        // Zero-cost transmutes - no load instructions needed
        let data_vec: uint8x16_t = std::mem::transmute(data);
        let idx_vec: uint8x16_t = std::mem::transmute(COMPRESS_BYTE_IDX_U8X16[mask as usize]);

        // Single TBL instruction shuffles all 16 bytes
        let result = vqtbl1q_u8(data_vec, idx_vec);

        // Store full 16 bytes directly
        vst1q_u8(dest.as_mut_ptr(), result);
    }
}
567
568/// Gather-style compress for u8x16 - direct indexed writes to destination.
569/// Used as fallback on x86-64 without AVX-512VBMI2 and other non-ARM architectures.
570#[cfg(not(target_arch = "aarch64"))]
571#[inline]
572fn compress_store_u8x16_gather(data: u8x16, mask: u16, dest: &mut [u8]) {
573    let arr = data.to_array();
574    let mut idx = 0;
575    // Unrolled gather: each element is conditionally written
576    // Compiler can optimize this to efficient indexed stores
577    if mask & (1 << 0) != 0 { dest[idx] = arr[0]; idx += 1; }
578    if mask & (1 << 1) != 0 { dest[idx] = arr[1]; idx += 1; }
579    if mask & (1 << 2) != 0 { dest[idx] = arr[2]; idx += 1; }
580    if mask & (1 << 3) != 0 { dest[idx] = arr[3]; idx += 1; }
581    if mask & (1 << 4) != 0 { dest[idx] = arr[4]; idx += 1; }
582    if mask & (1 << 5) != 0 { dest[idx] = arr[5]; idx += 1; }
583    if mask & (1 << 6) != 0 { dest[idx] = arr[6]; idx += 1; }
584    if mask & (1 << 7) != 0 { dest[idx] = arr[7]; idx += 1; }
585    if mask & (1 << 8) != 0 { dest[idx] = arr[8]; idx += 1; }
586    if mask & (1 << 9) != 0 { dest[idx] = arr[9]; idx += 1; }
587    if mask & (1 << 10) != 0 { dest[idx] = arr[10]; idx += 1; }
588    if mask & (1 << 11) != 0 { dest[idx] = arr[11]; idx += 1; }
589    if mask & (1 << 12) != 0 { dest[idx] = arr[12]; idx += 1; }
590    if mask & (1 << 13) != 0 { dest[idx] = arr[13]; idx += 1; }
591    if mask & (1 << 14) != 0 { dest[idx] = arr[14]; idx += 1; }
592    if mask & (1 << 15) != 0 { dest[idx] = arr[15]; }
593}
594
595/// Compress u8x16 using shuffle tables (used by compress_u8x16 which returns a vector).
596/// This is a two-pass approach that handles each 8-byte half separately.
597#[inline]
598fn compress_u8x16_shuffle(data: u8x16, mask: u16) -> u8x16 {
599    let lo_mask = (mask & 0xFF) as u8;
600    let hi_mask = ((mask >> 8) & 0xFF) as u8;
601
602    let lo_count = lo_mask.count_ones() as usize;
603    let hi_count = hi_mask.count_ones() as usize;
604
605    // Get shuffle indices for each half
606    let lo_indices = &SHUFFLE_COMPRESS_IDX_U8_LO[lo_mask as usize];
607    let hi_indices = &SHUFFLE_COMPRESS_IDX_U8_HI[hi_mask as usize];
608
609    // Build the full 16-byte shuffle index using slice operations
610    let mut indices = [0u8; 16];
611
612    // Copy low indices (variable count based on mask popcount)
613    indices[..lo_count].copy_from_slice(&lo_indices[..lo_count]);
614
615    // Copy high indices after low results
616    let hi_copy_count = hi_count.min(16 - lo_count);
617    indices[lo_count..lo_count + hi_copy_count].copy_from_slice(&hi_indices[..hi_copy_count]);
618
619    // Remaining positions stay 0 (safe filler)
620    data.shuffle(u8x16::from(indices))
621}
622
623// =============================================================================
624// Tests
625// =============================================================================
626
627#[cfg(test)]
628mod tests {
629    use super::*;
630
631    #[test]
632    fn test_compress_u32x8_basic() {
633        let data = u32x8::from([10, 20, 30, 40, 50, 60, 70, 80]);
634        let mask = 0b10110010u8;
635        let mut output = [0u32; 8];
636
637        let count = compress_store_u32x8(data, mask, &mut output);
638
639        assert_eq!(count, 4);
640        assert_eq!(output[0], 20);
641        assert_eq!(output[1], 50);
642        assert_eq!(output[2], 60);
643        assert_eq!(output[3], 80);
644    }
645
646    #[test]
647    fn test_compress_u32x8_all() {
648        let data = u32x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
649        let mask = 0xFFu8;
650        let mut output = [0u32; 8];
651
652        let count = compress_store_u32x8(data, mask, &mut output);
653
654        assert_eq!(count, 8);
655        assert_eq!(output, [1, 2, 3, 4, 5, 6, 7, 8]);
656    }
657
658    #[test]
659    fn test_compress_u32x8_none() {
660        let data = u32x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
661        let mask = 0x00u8;
662        let mut output = [0u32; 8];
663
664        let count = compress_store_u32x8(data, mask, &mut output);
665
666        assert_eq!(count, 0);
667    }
668
669    #[test]
670    fn test_compress_u32x8_first_only() {
671        let data = u32x8::from([42, 2, 3, 4, 5, 6, 7, 8]);
672        let mask = 0b00000001u8;
673        let mut output = [0u32; 8];
674
675        let count = compress_store_u32x8(data, mask, &mut output);
676
677        assert_eq!(count, 1);
678        assert_eq!(output[0], 42);
679    }
680
681    #[test]
682    fn test_compress_u32x8_last_only() {
683        let data = u32x8::from([1, 2, 3, 4, 5, 6, 7, 99]);
684        let mask = 0b10000000u8;
685        let mut output = [0u32; 8];
686
687        let count = compress_store_u32x8(data, mask, &mut output);
688
689        assert_eq!(count, 1);
690        assert_eq!(output[0], 99);
691    }
692
693    #[test]
694    fn test_compress_u8x16_basic() {
695        let data = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
696        let mask = 0b1000000100000101u16;
697        let mut output = [0u8; 16];
698
699        let count = compress_store_u8x16(data, mask, &mut output);
700
701        assert_eq!(count, 4);
702        assert_eq!(output[0], 0);
703        assert_eq!(output[1], 2);
704        assert_eq!(output[2], 8);
705        assert_eq!(output[3], 15);
706    }
707
708    #[test]
709    fn test_compress_u8x16_all() {
710        let data = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
711        let mask = 0xFFFFu16;
712        let mut output = [0u8; 16];
713
714        let count = compress_store_u8x16(data, mask, &mut output);
715
716        assert_eq!(count, 16);
717        assert_eq!(output, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
718    }
719
720    #[test]
721    fn test_compress_u8x16_none() {
722        let data = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
723        let mask = 0x0000u16;
724        let mut output = [0u8; 16];
725
726        let count = compress_store_u8x16(data, mask, &mut output);
727
728        assert_eq!(count, 0);
729    }
730
731    #[test]
732    fn test_compress_u8x16_low_half_only() {
733        let data = u8x16::from([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160]);
734        let mask = 0b0000000010101010u16;
735        let mut output = [0u8; 16];
736
737        let count = compress_store_u8x16(data, mask, &mut output);
738
739        assert_eq!(count, 4);
740        assert_eq!(output[0], 20);
741        assert_eq!(output[1], 40);
742        assert_eq!(output[2], 60);
743        assert_eq!(output[3], 80);
744    }
745
746    #[test]
747    fn test_compress_u8x16_high_half_only() {
748        let data = u8x16::from([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160]);
749        let mask = 0b0101010100000000u16;
750        let mut output = [0u8; 16];
751
752        let count = compress_store_u8x16(data, mask, &mut output);
753
754        assert_eq!(count, 4);
755        assert_eq!(output[0], 90);
756        assert_eq!(output[1], 110);
757        assert_eq!(output[2], 130);
758        assert_eq!(output[3], 150);
759    }
760
761    #[test]
762    fn test_compress_u32x8_return_vector() {
763        let data = u32x8::from([10, 20, 30, 40, 50, 60, 70, 80]);
764        let mask = 0b10110010u8;
765
766        let (result, count) = compress_u32x8(data, mask);
767        let arr = result.to_array();
768
769        assert_eq!(count, 4);
770        assert_eq!(arr[0], 20);
771        assert_eq!(arr[1], 50);
772        assert_eq!(arr[2], 60);
773        assert_eq!(arr[3], 80);
774    }
775
776    #[test]
777    fn test_compress_u8x16_return_vector() {
778        let data = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
779        let mask = 0b1000000100000101u16;
780
781        let (result, count) = compress_u8x16(data, mask);
782        let arr = result.to_array();
783
784        assert_eq!(count, 4);
785        assert_eq!(arr[0], 0);
786        assert_eq!(arr[1], 2);
787        assert_eq!(arr[2], 8);
788        assert_eq!(arr[3], 15);
789    }
790
791    // =========================================================================
792    // u32x16 Tests
793    // =========================================================================
794
795    #[test]
796    fn test_compress_u32x16_basic() {
797        let data = u32x16::from([
798            10, 20, 30, 40, 50, 60, 70, 80,
799            90, 100, 110, 120, 130, 140, 150, 160
800        ]);
801        let mask = 0b1000000110110010u16;
802        let mut output = [0u32; 16];
803
804        let count = compress_store_u32x16(data, mask, &mut output);
805
806        assert_eq!(count, 6);
807        assert_eq!(output[0], 20);
808        assert_eq!(output[1], 50);
809        assert_eq!(output[2], 60);
810        assert_eq!(output[3], 80);
811        assert_eq!(output[4], 90);
812        assert_eq!(output[5], 160);
813    }
814
815    #[test]
816    fn test_compress_u32x16_all() {
817        let data = u32x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
818        let mask = 0xFFFFu16;
819        let mut output = [0u32; 16];
820
821        let count = compress_store_u32x16(data, mask, &mut output);
822
823        assert_eq!(count, 16);
824        assert_eq!(output, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
825    }
826
827    #[test]
828    fn test_compress_u32x16_none() {
829        let data = u32x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
830        let mask = 0x0000u16;
831        let mut output = [0u32; 16];
832
833        let count = compress_store_u32x16(data, mask, &mut output);
834
835        assert_eq!(count, 0);
836    }
837
838    #[test]
839    fn test_compress_u32x16_low_half_only() {
840        let data = u32x16::from([
841            10, 20, 30, 40, 50, 60, 70, 80,
842            90, 100, 110, 120, 130, 140, 150, 160
843        ]);
844        let mask = 0b0000000001010101u16;
845        let mut output = [0u32; 16];
846
847        let count = compress_store_u32x16(data, mask, &mut output);
848
849        assert_eq!(count, 4);
850        assert_eq!(output[0], 10);
851        assert_eq!(output[1], 30);
852        assert_eq!(output[2], 50);
853        assert_eq!(output[3], 70);
854    }
855
856    #[test]
857    fn test_compress_u32x16_high_half_only() {
858        let data = u32x16::from([
859            10, 20, 30, 40, 50, 60, 70, 80,
860            90, 100, 110, 120, 130, 140, 150, 160
861        ]);
862        let mask = 0b0101010100000000u16;
863        let mut output = [0u32; 16];
864
865        let count = compress_store_u32x16(data, mask, &mut output);
866
867        assert_eq!(count, 4);
868        assert_eq!(output[0], 90);
869        assert_eq!(output[1], 110);
870        assert_eq!(output[2], 130);
871        assert_eq!(output[3], 150);
872    }
873
874    #[test]
875    fn test_compress_u32x16_return_vector() {
876        let data = u32x16::from([
877            10, 20, 30, 40, 50, 60, 70, 80,
878            90, 100, 110, 120, 130, 140, 150, 160
879        ]);
880        let mask = 0b1000000110110010u16;
881
882        let (result, count) = compress_u32x16(data, mask);
883        let arr = result.to_array();
884
885        assert_eq!(count, 6);
886        assert_eq!(arr[0], 20);
887        assert_eq!(arr[1], 50);
888        assert_eq!(arr[2], 60);
889        assert_eq!(arr[3], 80);
890        assert_eq!(arr[4], 90);
891        assert_eq!(arr[5], 160);
892    }
893
894    #[test]
895    fn test_compress_u32x16_first_and_last() {
896        let data = u32x16::from([
897            100, 0, 0, 0, 0, 0, 0, 0,
898            0, 0, 0, 0, 0, 0, 0, 200
899        ]);
900        let mask = 0b1000000000000001u16;
901        let mut output = [0u32; 16];
902
903        let count = compress_store_u32x16(data, mask, &mut output);
904
905        assert_eq!(count, 2);
906        assert_eq!(output[0], 100);
907        assert_eq!(output[1], 200);
908    }
909
910    // =========================================================================
911    // Buffer size requirement tests
912    // =========================================================================
913
914    #[test]
915    #[should_panic(expected = "destination buffer must have room for 8 elements")]
916    fn test_compress_u32x8_panics_on_small_buffer() {
917        let data = u32x8::from([10, 20, 30, 40, 50, 60, 70, 80]);
918        let mask = 0b10110010u8;
919        let mut output = [0u32; 4]; // Too small!
920        compress_store_u32x8(data, mask, &mut output);
921    }
922
923    #[test]
924    #[should_panic(expected = "destination buffer must have room for 16 elements")]
925    fn test_compress_u8x16_panics_on_small_buffer() {
926        let data = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
927        let mask = 0b1000000100000101u16;
928        let mut output = [0u8; 8]; // Too small!
929        compress_store_u8x16(data, mask, &mut output);
930    }
931
932    #[test]
933    #[should_panic(expected = "destination buffer must have room for 16 elements")]
934    fn test_compress_u32x16_panics_on_small_buffer() {
935        let data = u32x16::from([
936            10, 20, 30, 40, 50, 60, 70, 80,
937            90, 100, 110, 120, 130, 140, 150, 160
938        ]);
939        let mask = 0b1000000110110010u16;
940        let mut output = [0u32; 8]; // Too small!
941        compress_store_u32x16(data, mask, &mut output);
942    }
943}