1use wide::{u8x16, u32x8, u32x16};
70
71#[cfg(target_arch = "x86_64")]
72use std::arch::x86_64::*;
73
74#[cfg(target_arch = "x86_64")]
75use std::arch::is_x86_feature_detected;
76
77#[cfg(target_arch = "aarch64")]
78use std::arch::aarch64::*;
79
80use crate::wide_utils::{
81 SimdSplit, WideUtilsExt,
82 SHUFFLE_COMPRESS_IDX_U8_HI, SHUFFLE_COMPRESS_IDX_U8_LO,
83};
84
85#[cfg(not(target_arch = "aarch64"))]
86use crate::wide_utils::get_compress_indices_u32x8;
87
#[inline]
/// Compresses the lanes of `data` selected by `mask` (bit `i` selects lane
/// `i`) to the front of `dest`, returning the number of lanes written.
///
/// `dest` must have room for all 8 lanes even when fewer are selected: the
/// aarch64 path always stores a full 32-byte vector, so up to 8 slots may be
/// written. Only the first `count` elements of `dest` are meaningful.
///
/// Dispatch: AVX-512 compress-store when AVX512F+VL are detected, otherwise
/// a scalar gather loop on x86_64; a table-driven `vqtbl2q_u8` shuffle on
/// aarch64; the scalar gather loop on all other targets.
///
/// # Panics
/// Panics if `dest.len() < 8`.
pub fn compress_store_u32x8(data: u32x8, mask: u8, dest: &mut [u32]) -> usize {
    let count = mask.count_ones() as usize;
    assert!(dest.len() >= 8, "destination buffer must have room for 8 elements");

    #[cfg(target_arch = "x86_64")]
    {
        // Prefer the single-instruction AVX-512 compress-store when available.
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            unsafe { compress_store_u32x8_avx512(data, mask, dest) };
            return count;
        }
        compress_store_u32x8_gather(data, mask, dest);
        return count;
    }

    #[cfg(target_arch = "aarch64")]
    {
        unsafe { compress_store_u32x8_neon(data, mask, count, dest) };
        return count;
    }

    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        compress_store_u32x8_gather(data, mask, dest);
        count
    }
}
135
#[inline]
/// Compresses the `mask`-selected lanes of `data` (bit `i` selects lane `i`)
/// to the front of the returned vector; also returns the selected-lane count.
///
/// On the AVX-512 path the unselected tail lanes of the result are zero
/// (`maskz` compress). On the other paths the tail contents come from the
/// respective shuffle index tables — callers should rely only on the first
/// `count` lanes.
pub fn compress_u32x8(data: u32x8, mask: u8) -> (u32x8, usize) {
    let count = mask.count_ones() as usize;

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            let result = unsafe { compress_u32x8_avx512(data, mask) };
            return (result, count);
        }
        // Portable fallback: table-driven lane shuffle.
        let indices = get_compress_indices_u32x8(mask);
        let result = data.shuffle(indices);
        return (result, count);
    }

    #[cfg(target_arch = "aarch64")]
    {
        let result = unsafe { compress_u32x8_neon_vec(data, mask) };
        return (result, count);
    }

    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        let indices = get_compress_indices_u32x8(mask);
        let result = data.shuffle(indices);
        (result, count)
    }
}
169
/// AVX-512 compress-store: writes the `mask`-selected lanes of `data`
/// contiguously to `dest` (only `mask.count_ones()` elements are written).
///
/// # Safety
/// The CPU must support AVX512F and AVX512VL, and `dest` must have room for
/// the selected elements (the public caller asserts `dest.len() >= 8`).
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f", enable = "avx512vl")]
unsafe fn compress_store_u32x8_avx512(data: u32x8, mask: u8, dest: &mut [u32]) {
    unsafe {
        // SAFETY: u32x8 and __m256i are both 32-byte vectors; the intrinsic
        // writes at most 8 u32s, which the caller's length check guarantees.
        let raw = std::mem::transmute::<u32x8, __m256i>(data);
        _mm256_mask_compressstoreu_epi32(dest.as_mut_ptr() as *mut i32, mask, raw);
    }
}
179
/// AVX-512 compress: `mask`-selected lanes of `data` move to the front of
/// the result; remaining lanes are zeroed (`maskz` semantics).
///
/// # Safety
/// The CPU must support AVX512F and AVX512VL.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f", enable = "avx512vl")]
unsafe fn compress_u32x8_avx512(data: u32x8, mask: u8) -> u32x8 {
    unsafe {
        // SAFETY: u32x8 and __m256i share the same 32-byte layout.
        let raw = std::mem::transmute::<u32x8, __m256i>(data);
        let compressed = _mm256_maskz_compress_epi32(mask, raw);
        std::mem::transmute::<__m256i, u32x8>(compressed)
    }
}
190
#[cfg(target_arch = "aarch64")]
/// Per-mask `vqtbl2q_u8` index pairs (low / high 16 output bytes) used to
/// compress the eight u32 lanes of a 32-byte vector on NEON.
///
/// For each set bit `i` of the mask, the four bytes of source lane `i` are
/// routed to the next free 4-byte output slot. All unused index bytes are
/// `0xFF`: that is out of range for `vqtbl2q_u8`, so the corresponding
/// output bytes become zero. This makes the NEON compress zero-fill its
/// unselected tail lanes, matching the AVX-512 `maskz` compress path and the
/// zero-padded portable fallback (`compress_u32x16_fallback_to_vec`).
/// (Previously unused indices were 0, which duplicated lane 0 into the tail.)
static COMPRESS_BYTE_IDX_U32X8: [(u8x16, u8x16); 256] = {
    // SAFETY (inside arr_to_u8x16): u8x16 is a 16-byte vector with the same
    // size and layout as [u8; 16].
    const fn arr_to_u8x16(arr: [u8; 16]) -> u8x16 {
        unsafe { std::mem::transmute(arr) }
    }

    let mut table: [(u8x16, u8x16); 256] =
        [(arr_to_u8x16([0xFFu8; 16]), arr_to_u8x16([0xFFu8; 16])); 256];
    let mut mask = 0usize;
    while mask < 256 {
        // 0xFF = out-of-range tbl index => that output byte becomes 0.
        let mut indices_lo = [0xFFu8; 16];
        let mut indices_hi = [0xFFu8; 16];
        let mut dest_pos = 0usize;
        let mut src_pos = 0usize;
        while src_pos < 8 {
            if (mask >> src_pos) & 1 != 0 {
                let byte_base = (src_pos * 4) as u8;
                let dest_base = dest_pos * 4;
                if dest_base < 16 {
                    indices_lo[dest_base] = byte_base;
                    indices_lo[dest_base + 1] = byte_base + 1;
                    indices_lo[dest_base + 2] = byte_base + 2;
                    indices_lo[dest_base + 3] = byte_base + 3;
                } else {
                    let hi_base = dest_base - 16;
                    indices_hi[hi_base] = byte_base;
                    indices_hi[hi_base + 1] = byte_base + 1;
                    indices_hi[hi_base + 2] = byte_base + 2;
                    indices_hi[hi_base + 3] = byte_base + 3;
                }
                dest_pos += 1;
            }
            src_pos += 1;
        }
        table[mask] = (arr_to_u8x16(indices_lo), arr_to_u8x16(indices_hi));
        mask += 1;
    }
    table
};
238
/// NEON compress-store for 8 u32 lanes via two `vqtbl2q_u8` table shuffles.
///
/// Always stores the full 32 bytes (8 u32s) to `dest`; only the first
/// `count` elements are meaningful afterwards. `_count` is unused here but
/// kept in the signature so the call site can pass it without recomputing.
///
/// # Safety
/// Requires NEON (baseline on aarch64) and `dest.len() >= 8`, which the
/// public caller asserts.
#[cfg(target_arch = "aarch64")]
#[inline]
#[target_feature(enable = "neon")]
unsafe fn compress_store_u32x8_neon(data: u32x8, mask: u8, _count: usize, dest: &mut [u32]) {
    unsafe {
        // Precomputed byte-shuffle indices for this mask.
        let (idx_lo, idx_hi) = COMPRESS_BYTE_IDX_U32X8[mask as usize];

        // SAFETY: u32x8 is 32 bytes, matching (u8x16, u8x16); u8x16 matches
        // uint8x16_t byte-for-byte.
        let (data_lo, data_hi): (u8x16, u8x16) = std::mem::transmute(data);
        let tables = uint8x16x2_t(std::mem::transmute(data_lo), std::mem::transmute(data_hi));

        let result_lo = vqtbl2q_u8(tables, std::mem::transmute(idx_lo));
        let result_hi = vqtbl2q_u8(tables, std::mem::transmute(idx_hi));

        // SAFETY: dest.len() >= 8 u32s == 32 bytes, so both 16-byte stores
        // are in bounds.
        let dest_ptr = dest.as_mut_ptr() as *mut u8;
        vst1q_u8(dest_ptr, result_lo);
        vst1q_u8(dest_ptr.add(16), result_hi);
    }
}
263
/// NEON compress for 8 u32 lanes, returning the packed vector.
///
/// Selected lanes are moved to the front; the content of the tail lanes is
/// determined by the fill value of `COMPRESS_BYTE_IDX_U32X8` — callers
/// should rely only on the first `count` lanes.
///
/// # Safety
/// Requires NEON (baseline on aarch64).
#[cfg(target_arch = "aarch64")]
#[inline]
#[target_feature(enable = "neon")]
unsafe fn compress_u32x8_neon_vec(data: u32x8, mask: u8) -> u32x8 {
    unsafe {
        let (idx_lo, idx_hi) = COMPRESS_BYTE_IDX_U32X8[mask as usize];

        // SAFETY: u32x8 is 32 bytes, matching (u8x16, u8x16); u8x16 matches
        // uint8x16_t byte-for-byte.
        let (data_lo, data_hi): (u8x16, u8x16) = std::mem::transmute(data);
        let tables = uint8x16x2_t(std::mem::transmute(data_lo), std::mem::transmute(data_hi));

        let result_lo = vqtbl2q_u8(tables, std::mem::transmute(idx_lo));
        let result_hi = vqtbl2q_u8(tables, std::mem::transmute(idx_hi));

        let lo: u8x16 = std::mem::transmute(result_lo);
        let hi: u8x16 = std::mem::transmute(result_hi);

        // Reassemble the two 16-byte halves into one u32x8.
        std::mem::transmute((lo, hi))
    }
}
287
288#[cfg(not(target_arch = "aarch64"))]
291#[inline]
292fn compress_store_u32x8_gather(data: u32x8, mask: u8, dest: &mut [u32]) {
293 let arr = data.to_array();
294 let mut idx = 0;
295 if mask & (1 << 0) != 0 { dest[idx] = arr[0]; idx += 1; }
296 if mask & (1 << 1) != 0 { dest[idx] = arr[1]; idx += 1; }
297 if mask & (1 << 2) != 0 { dest[idx] = arr[2]; idx += 1; }
298 if mask & (1 << 3) != 0 { dest[idx] = arr[3]; idx += 1; }
299 if mask & (1 << 4) != 0 { dest[idx] = arr[4]; idx += 1; }
300 if mask & (1 << 5) != 0 { dest[idx] = arr[5]; idx += 1; }
301 if mask & (1 << 6) != 0 { dest[idx] = arr[6]; idx += 1; }
302 if mask & (1 << 7) != 0 { dest[idx] = arr[7]; }
303}
304
#[inline]
/// Compresses the `mask`-selected lanes of the 16-lane vector to the front
/// of `dest`, returning the number of lanes written. Uses a single AVX-512F
/// compress-store when detected; otherwise splits into two u32x8 halves.
///
/// `dest` must have room for all 16 lanes; only the first `count` elements
/// are meaningful afterwards.
///
/// # Panics
/// Panics if `dest.len() < 16`.
pub fn compress_store_u32x16(data: u32x16, mask: u16, dest: &mut [u32]) -> usize {
    let count = mask.count_ones() as usize;
    assert!(dest.len() >= 16, "destination buffer must have room for 16 elements");

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx512f") {
            unsafe { compress_store_u32x16_avx512(data, mask, dest) };
            return count;
        }
    }

    compress_store_u32x16_fallback(data, mask, dest);
    count
}
339
#[inline]
/// Compresses the `mask`-selected lanes of the 16-lane vector to the front
/// of the returned vector; also returns the selected-lane count. Uses a
/// single AVX-512F compress when detected, otherwise combines two u32x8
/// compresses (the fallback zero-pads the tail lanes).
pub fn compress_u32x16(data: u32x16, mask: u16) -> (u32x16, usize) {
    let count = mask.count_ones() as usize;

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx512f") {
            let result = unsafe { compress_u32x16_avx512(data, mask) };
            return (result, count);
        }
    }

    let result = compress_u32x16_fallback_to_vec(data, mask);
    (result, count)
}
358
/// AVX-512F compress-store: writes the `mask`-selected lanes of `data`
/// contiguously to `dest` (only `mask.count_ones()` elements are written).
///
/// # Safety
/// The CPU must support AVX512F, and `dest` must have room for the selected
/// elements (the public caller asserts `dest.len() >= 16`).
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn compress_store_u32x16_avx512(data: u32x16, mask: u16, dest: &mut [u32]) {
    unsafe {
        // SAFETY: u32x16 and __m512i are both 64-byte vectors.
        let raw = std::mem::transmute::<u32x16, __m512i>(data);
        _mm512_mask_compressstoreu_epi32(dest.as_mut_ptr() as *mut i32, mask, raw);
    }
}
368
/// AVX-512F compress: `mask`-selected lanes of `data` move to the front of
/// the result; remaining lanes are zeroed (`maskz` semantics).
///
/// # Safety
/// The CPU must support AVX512F.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn compress_u32x16_avx512(data: u32x16, mask: u16) -> u32x16 {
    unsafe {
        // SAFETY: u32x16 and __m512i share the same 64-byte layout.
        let raw = std::mem::transmute::<u32x16, __m512i>(data);
        let compressed = _mm512_maskz_compress_epi32(mask, raw);
        std::mem::transmute::<__m512i, u32x16>(compressed)
    }
}
379
380#[inline]
382fn compress_store_u32x16_fallback(data: u32x16, mask: u16, dest: &mut [u32]) {
383 let (lo, hi) = data.split_low_high();
385
386 let lo_mask = (mask & 0xFF) as u8;
387 let hi_mask = ((mask >> 8) & 0xFF) as u8;
388
389 let lo_count = compress_store_u32x8(lo, lo_mask, dest);
391
392 let _ = compress_store_u32x8(hi, hi_mask, &mut dest[lo_count..]);
394}
395
396#[inline]
398fn compress_u32x16_fallback_to_vec(data: u32x16, mask: u16) -> u32x16 {
399 let (lo, hi) = data.split_low_high();
401
402 let lo_mask = (mask & 0xFF) as u8;
403 let hi_mask = ((mask >> 8) & 0xFF) as u8;
404
405 let (lo_compressed, lo_count) = compress_u32x8(lo, lo_mask);
407 let (hi_compressed, hi_count) = compress_u32x8(hi, hi_mask);
408
409 let lo_arr = lo_compressed.to_array();
411 let hi_arr = hi_compressed.to_array();
412
413 let mut result = [0u32; 16];
414
415 result[..lo_count].copy_from_slice(&lo_arr[..lo_count]);
417
418 let hi_copy_count = hi_count.min(16 - lo_count);
420 result[lo_count..lo_count + hi_copy_count].copy_from_slice(&hi_arr[..hi_copy_count]);
421
422 u32x16::from(result)
423}
424
#[inline]
/// Compresses the bytes of `data` selected by `mask` (bit `i` selects byte
/// `i`) to the front of `dest`, returning the number of bytes written.
///
/// `dest` must have room for all 16 bytes even when fewer are selected: the
/// aarch64 path always stores a full 16-byte vector. Only the first `count`
/// bytes of `dest` are meaningful afterwards.
///
/// Dispatch: AVX-512 VBMI2 byte compress-store when detected, otherwise a
/// scalar gather loop on x86_64; a table-driven `vqtbl1q_u8` shuffle on
/// aarch64; the scalar gather loop on all other targets.
///
/// # Panics
/// Panics if `dest.len() < 16`.
pub fn compress_store_u8x16(data: u8x16, mask: u16, dest: &mut [u8]) -> usize {
    let count = mask.count_ones() as usize;
    assert!(dest.len() >= 16, "destination buffer must have room for 16 elements");

    #[cfg(target_arch = "x86_64")]
    {
        // Byte-granularity compress needs AVX512VBMI2 + VL.
        if is_x86_feature_detected!("avx512vbmi2") && is_x86_feature_detected!("avx512vl") {
            unsafe { compress_store_u8x16_avx512(data, mask, dest) };
            return count;
        }
        compress_store_u8x16_gather(data, mask, dest);
        return count;
    }

    #[cfg(target_arch = "aarch64")]
    {
        unsafe { compress_store_u8x16_neon(data, mask, count, dest) };
        return count;
    }

    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        compress_store_u8x16_gather(data, mask, dest);
        count
    }
}
473
#[inline]
/// Compresses the `mask`-selected bytes of `data` to the front of the
/// returned vector; also returns the selected-byte count.
///
/// On the AVX-512 VBMI2 path the unselected tail bytes are zero (`maskz`
/// compress). On the shuffle fallback the tail contents come from the
/// `SHUFFLE_COMPRESS_IDX_U8_*` tables — callers should rely only on the
/// first `count` bytes.
pub fn compress_u8x16(data: u8x16, mask: u16) -> (u8x16, usize) {
    let count = mask.count_ones() as usize;

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx512vbmi2") && is_x86_feature_detected!("avx512vl") {
            let result = unsafe { compress_u8x16_avx512(data, mask) };
            return (result, count);
        }
    }

    let result = compress_u8x16_shuffle(data, mask);
    (result, count)
}
492
/// AVX-512 VBMI2 byte compress-store: writes the `mask`-selected bytes of
/// `data` contiguously to `dest` (only `mask.count_ones()` bytes written).
///
/// # Safety
/// The CPU must support AVX512VBMI2 and AVX512VL, and `dest` must have room
/// for the selected bytes (the public caller asserts `dest.len() >= 16`).
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512vbmi2", enable = "avx512vl")]
unsafe fn compress_store_u8x16_avx512(data: u8x16, mask: u16, dest: &mut [u8]) {
    unsafe {
        // SAFETY: u8x16 and __m128i are both 16-byte vectors.
        let raw = std::mem::transmute::<u8x16, __m128i>(data);
        _mm_mask_compressstoreu_epi8(dest.as_mut_ptr() as *mut i8, mask, raw);
    }
}
502
/// AVX-512 VBMI2 byte compress: `mask`-selected bytes of `data` move to the
/// front of the result; remaining bytes are zeroed (`maskz` semantics).
///
/// # Safety
/// The CPU must support AVX512VBMI2 and AVX512VL.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512vbmi2", enable = "avx512vl")]
unsafe fn compress_u8x16_avx512(data: u8x16, mask: u16) -> u8x16 {
    unsafe {
        // SAFETY: u8x16 and __m128i share the same 16-byte layout.
        let raw = std::mem::transmute::<u8x16, __m128i>(data);
        let compressed = _mm_maskz_compress_epi8(mask, raw);
        std::mem::transmute::<__m128i, u8x16>(compressed)
    }
}
513
#[cfg(target_arch = "aarch64")]
/// Per-mask `vqtbl1q_u8` shuffle indices for compressing 16 u8 lanes.
///
/// Entry `m` routes source byte `i` (for each set bit `i` of `m`) to the
/// next free output position. Unused positions keep index 0, so the shuffle
/// duplicates byte 0 into the tail — the store path only promises the first
/// `count` output bytes, so this is scratch by contract.
///
/// NOTE(review): 65536 entries x 16 bytes = 1 MiB of static data; confirm
/// this footprint is acceptable versus computing indices on the fly.
static COMPRESS_BYTE_IDX_U8X16: [u8x16; 65536] = {
    // SAFETY (inside arr_to_u8x16): u8x16 has the same size/layout as [u8; 16].
    const fn arr_to_u8x16(arr: [u8; 16]) -> u8x16 {
        unsafe { std::mem::transmute(arr) }
    }

    let mut table: [u8x16; 65536] = [arr_to_u8x16([0u8; 16]); 65536];
    let mut mask = 0usize;
    while mask < 65536 {
        let mut indices = [0u8; 16];
        let mut dest_pos = 0usize;
        let mut src_pos = 0usize;
        while src_pos < 16 {
            if (mask >> src_pos) & 1 != 0 {
                indices[dest_pos] = src_pos as u8;
                dest_pos += 1;
            }
            src_pos += 1;
        }
        table[mask] = arr_to_u8x16(indices);
        mask += 1;
    }
    table
};
547
/// NEON compress-store for 16 u8 lanes via a single `vqtbl1q_u8` shuffle.
///
/// Always stores the full 16 bytes to `dest`; only the first `count` bytes
/// are meaningful afterwards. `_count` is unused here but kept so the call
/// site can pass it without recomputing.
///
/// # Safety
/// Requires NEON (baseline on aarch64) and `dest.len() >= 16`, which the
/// public caller asserts.
#[cfg(target_arch = "aarch64")]
#[inline]
#[target_feature(enable = "neon")]
unsafe fn compress_store_u8x16_neon(data: u8x16, mask: u16, _count: usize, dest: &mut [u8]) {
    unsafe {
        // SAFETY: u8x16 matches uint8x16_t byte-for-byte.
        let data_vec: uint8x16_t = std::mem::transmute(data);
        let idx_vec: uint8x16_t = std::mem::transmute(COMPRESS_BYTE_IDX_U8X16[mask as usize]);

        let result = vqtbl1q_u8(data_vec, idx_vec);

        // SAFETY: dest.len() >= 16, so the full-vector store is in bounds.
        vst1q_u8(dest.as_mut_ptr(), result);
    }
}
567
568#[cfg(not(target_arch = "aarch64"))]
571#[inline]
572fn compress_store_u8x16_gather(data: u8x16, mask: u16, dest: &mut [u8]) {
573 let arr = data.to_array();
574 let mut idx = 0;
575 if mask & (1 << 0) != 0 { dest[idx] = arr[0]; idx += 1; }
578 if mask & (1 << 1) != 0 { dest[idx] = arr[1]; idx += 1; }
579 if mask & (1 << 2) != 0 { dest[idx] = arr[2]; idx += 1; }
580 if mask & (1 << 3) != 0 { dest[idx] = arr[3]; idx += 1; }
581 if mask & (1 << 4) != 0 { dest[idx] = arr[4]; idx += 1; }
582 if mask & (1 << 5) != 0 { dest[idx] = arr[5]; idx += 1; }
583 if mask & (1 << 6) != 0 { dest[idx] = arr[6]; idx += 1; }
584 if mask & (1 << 7) != 0 { dest[idx] = arr[7]; idx += 1; }
585 if mask & (1 << 8) != 0 { dest[idx] = arr[8]; idx += 1; }
586 if mask & (1 << 9) != 0 { dest[idx] = arr[9]; idx += 1; }
587 if mask & (1 << 10) != 0 { dest[idx] = arr[10]; idx += 1; }
588 if mask & (1 << 11) != 0 { dest[idx] = arr[11]; idx += 1; }
589 if mask & (1 << 12) != 0 { dest[idx] = arr[12]; idx += 1; }
590 if mask & (1 << 13) != 0 { dest[idx] = arr[13]; idx += 1; }
591 if mask & (1 << 14) != 0 { dest[idx] = arr[14]; idx += 1; }
592 if mask & (1 << 15) != 0 { dest[idx] = arr[15]; }
593}
594
595#[inline]
598fn compress_u8x16_shuffle(data: u8x16, mask: u16) -> u8x16 {
599 let lo_mask = (mask & 0xFF) as u8;
600 let hi_mask = ((mask >> 8) & 0xFF) as u8;
601
602 let lo_count = lo_mask.count_ones() as usize;
603 let hi_count = hi_mask.count_ones() as usize;
604
605 let lo_indices = &SHUFFLE_COMPRESS_IDX_U8_LO[lo_mask as usize];
607 let hi_indices = &SHUFFLE_COMPRESS_IDX_U8_HI[hi_mask as usize];
608
609 let mut indices = [0u8; 16];
611
612 indices[..lo_count].copy_from_slice(&lo_indices[..lo_count]);
614
615 let hi_copy_count = hi_count.min(16 - lo_count);
617 indices[lo_count..lo_count + hi_copy_count].copy_from_slice(&hi_indices[..hi_copy_count]);
618
619 data.shuffle(u8x16::from(indices))
621}
622
/// Unit tests covering the public compress / compress-store entry points:
/// basic masks, all-bits, no-bits, single-lane edges, half-vector masks,
/// the vector-returning variants, and the too-small-buffer panics.
/// Assertions only inspect the first `count` output elements, since tail
/// contents are platform-dependent.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_compress_u32x8_basic() {
        let data = u32x8::from([10, 20, 30, 40, 50, 60, 70, 80]);
        let mask = 0b10110010u8;
        let mut output = [0u32; 8];

        let count = compress_store_u32x8(data, mask, &mut output);

        // Bits 1, 4, 5, 7 are set -> lanes 20, 50, 60, 80 survive, in order.
        assert_eq!(count, 4);
        assert_eq!(output[0], 20);
        assert_eq!(output[1], 50);
        assert_eq!(output[2], 60);
        assert_eq!(output[3], 80);
    }

    #[test]
    fn test_compress_u32x8_all() {
        let data = u32x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
        let mask = 0xFFu8;
        let mut output = [0u32; 8];

        let count = compress_store_u32x8(data, mask, &mut output);

        assert_eq!(count, 8);
        assert_eq!(output, [1, 2, 3, 4, 5, 6, 7, 8]);
    }

    #[test]
    fn test_compress_u32x8_none() {
        let data = u32x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
        let mask = 0x00u8;
        let mut output = [0u32; 8];

        let count = compress_store_u32x8(data, mask, &mut output);

        assert_eq!(count, 0);
    }

    #[test]
    fn test_compress_u32x8_first_only() {
        let data = u32x8::from([42, 2, 3, 4, 5, 6, 7, 8]);
        let mask = 0b00000001u8;
        let mut output = [0u32; 8];

        let count = compress_store_u32x8(data, mask, &mut output);

        assert_eq!(count, 1);
        assert_eq!(output[0], 42);
    }

    #[test]
    fn test_compress_u32x8_last_only() {
        let data = u32x8::from([1, 2, 3, 4, 5, 6, 7, 99]);
        let mask = 0b10000000u8;
        let mut output = [0u32; 8];

        let count = compress_store_u32x8(data, mask, &mut output);

        assert_eq!(count, 1);
        assert_eq!(output[0], 99);
    }

    #[test]
    fn test_compress_u8x16_basic() {
        let data = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let mask = 0b1000000100000101u16;
        let mut output = [0u8; 16];

        let count = compress_store_u8x16(data, mask, &mut output);

        // Bits 0, 2, 8, 15 set -> bytes 0, 2, 8, 15 survive.
        assert_eq!(count, 4);
        assert_eq!(output[0], 0);
        assert_eq!(output[1], 2);
        assert_eq!(output[2], 8);
        assert_eq!(output[3], 15);
    }

    #[test]
    fn test_compress_u8x16_all() {
        let data = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let mask = 0xFFFFu16;
        let mut output = [0u8; 16];

        let count = compress_store_u8x16(data, mask, &mut output);

        assert_eq!(count, 16);
        assert_eq!(output, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
    }

    #[test]
    fn test_compress_u8x16_none() {
        let data = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let mask = 0x0000u16;
        let mut output = [0u8; 16];

        let count = compress_store_u8x16(data, mask, &mut output);

        assert_eq!(count, 0);
    }

    #[test]
    fn test_compress_u8x16_low_half_only() {
        let data = u8x16::from([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160]);
        let mask = 0b0000000010101010u16;
        let mut output = [0u8; 16];

        let count = compress_store_u8x16(data, mask, &mut output);

        assert_eq!(count, 4);
        assert_eq!(output[0], 20);
        assert_eq!(output[1], 40);
        assert_eq!(output[2], 60);
        assert_eq!(output[3], 80);
    }

    #[test]
    fn test_compress_u8x16_high_half_only() {
        let data = u8x16::from([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160]);
        let mask = 0b0101010100000000u16;
        let mut output = [0u8; 16];

        let count = compress_store_u8x16(data, mask, &mut output);

        assert_eq!(count, 4);
        assert_eq!(output[0], 90);
        assert_eq!(output[1], 110);
        assert_eq!(output[2], 130);
        assert_eq!(output[3], 150);
    }

    #[test]
    fn test_compress_u32x8_return_vector() {
        let data = u32x8::from([10, 20, 30, 40, 50, 60, 70, 80]);
        let mask = 0b10110010u8;

        let (result, count) = compress_u32x8(data, mask);
        let arr = result.to_array();

        assert_eq!(count, 4);
        assert_eq!(arr[0], 20);
        assert_eq!(arr[1], 50);
        assert_eq!(arr[2], 60);
        assert_eq!(arr[3], 80);
    }

    #[test]
    fn test_compress_u8x16_return_vector() {
        let data = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let mask = 0b1000000100000101u16;

        let (result, count) = compress_u8x16(data, mask);
        let arr = result.to_array();

        assert_eq!(count, 4);
        assert_eq!(arr[0], 0);
        assert_eq!(arr[1], 2);
        assert_eq!(arr[2], 8);
        assert_eq!(arr[3], 15);
    }

    #[test]
    fn test_compress_u32x16_basic() {
        let data = u32x16::from([
            10, 20, 30, 40, 50, 60, 70, 80,
            90, 100, 110, 120, 130, 140, 150, 160
        ]);
        let mask = 0b1000000110110010u16;
        let mut output = [0u32; 16];

        let count = compress_store_u32x16(data, mask, &mut output);

        // Mask spans both halves: bits 1, 4, 5, 7, 8, 15.
        assert_eq!(count, 6);
        assert_eq!(output[0], 20);
        assert_eq!(output[1], 50);
        assert_eq!(output[2], 60);
        assert_eq!(output[3], 80);
        assert_eq!(output[4], 90);
        assert_eq!(output[5], 160);
    }

    #[test]
    fn test_compress_u32x16_all() {
        let data = u32x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
        let mask = 0xFFFFu16;
        let mut output = [0u32; 16];

        let count = compress_store_u32x16(data, mask, &mut output);

        assert_eq!(count, 16);
        assert_eq!(output, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
    }

    #[test]
    fn test_compress_u32x16_none() {
        let data = u32x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
        let mask = 0x0000u16;
        let mut output = [0u32; 16];

        let count = compress_store_u32x16(data, mask, &mut output);

        assert_eq!(count, 0);
    }

    #[test]
    fn test_compress_u32x16_low_half_only() {
        let data = u32x16::from([
            10, 20, 30, 40, 50, 60, 70, 80,
            90, 100, 110, 120, 130, 140, 150, 160
        ]);
        let mask = 0b0000000001010101u16;
        let mut output = [0u32; 16];

        let count = compress_store_u32x16(data, mask, &mut output);

        assert_eq!(count, 4);
        assert_eq!(output[0], 10);
        assert_eq!(output[1], 30);
        assert_eq!(output[2], 50);
        assert_eq!(output[3], 70);
    }

    #[test]
    fn test_compress_u32x16_high_half_only() {
        let data = u32x16::from([
            10, 20, 30, 40, 50, 60, 70, 80,
            90, 100, 110, 120, 130, 140, 150, 160
        ]);
        let mask = 0b0101010100000000u16;
        let mut output = [0u32; 16];

        let count = compress_store_u32x16(data, mask, &mut output);

        assert_eq!(count, 4);
        assert_eq!(output[0], 90);
        assert_eq!(output[1], 110);
        assert_eq!(output[2], 130);
        assert_eq!(output[3], 150);
    }

    #[test]
    fn test_compress_u32x16_return_vector() {
        let data = u32x16::from([
            10, 20, 30, 40, 50, 60, 70, 80,
            90, 100, 110, 120, 130, 140, 150, 160
        ]);
        let mask = 0b1000000110110010u16;

        let (result, count) = compress_u32x16(data, mask);
        let arr = result.to_array();

        assert_eq!(count, 6);
        assert_eq!(arr[0], 20);
        assert_eq!(arr[1], 50);
        assert_eq!(arr[2], 60);
        assert_eq!(arr[3], 80);
        assert_eq!(arr[4], 90);
        assert_eq!(arr[5], 160);
    }

    #[test]
    fn test_compress_u32x16_first_and_last() {
        let data = u32x16::from([
            100, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 200
        ]);
        let mask = 0b1000000000000001u16;
        let mut output = [0u32; 16];

        let count = compress_store_u32x16(data, mask, &mut output);

        assert_eq!(count, 2);
        assert_eq!(output[0], 100);
        assert_eq!(output[1], 200);
    }

    // The undersized-buffer cases must hit the entry-point asserts.
    #[test]
    #[should_panic(expected = "destination buffer must have room for 8 elements")]
    fn test_compress_u32x8_panics_on_small_buffer() {
        let data = u32x8::from([10, 20, 30, 40, 50, 60, 70, 80]);
        let mask = 0b10110010u8;
        let mut output = [0u32; 4]; compress_store_u32x8(data, mask, &mut output);
    }

    #[test]
    #[should_panic(expected = "destination buffer must have room for 16 elements")]
    fn test_compress_u8x16_panics_on_small_buffer() {
        let data = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let mask = 0b1000000100000101u16;
        let mut output = [0u8; 8]; compress_store_u8x16(data, mask, &mut output);
    }

    #[test]
    #[should_panic(expected = "destination buffer must have room for 16 elements")]
    fn test_compress_u32x16_panics_on_small_buffer() {
        let data = u32x16::from([
            10, 20, 30, 40, 50, 60, 70, 80,
            90, 100, 110, 120, 130, 140, 150, 160
        ]);
        let mask = 0b1000000110110010u16;
        let mut output = [0u32; 8]; compress_store_u32x16(data, mask, &mut output);
    }
}