simd_lookup/lib.rs
1//! # simd-lookup
2//!
3//! High-performance SIMD utilities for fast table lookups, compression, and data processing.
4//!
5//! ## Features
6//!
7//! - **Cross-platform SIMD**: Automatic dispatch to optimal implementation (AVX-512, AVX2, NEON)
8//! - **Zero-cost abstractions**: Thin wrappers over platform intrinsics via the [`wide`] crate
9//! - **ARM NEON optimized**: Compress operations achieve up to 12 Gelem/s on Apple Silicon
10//!
11//! ## Core Modules
12//!
13//! - [`simd_compress`] — Stream compaction (VCOMPRESS): pack selected elements by bitmask
14//! - [`simd_gather`] — Parallel memory gather with SIMD indices
15//! - [`small_table`] — 64-byte lookup tables using ARM TBL4 / AVX-512 VPERMB
16//! - [`wide_utils`] — Shuffle, widen, split, and bitmask utilities for `wide` types
17//! - [`prefetch`] — Cross-platform memory prefetch (L1/L2/L3 hints)
18//!
19//!
20//! ## Quick Example
21//!
22//! ```rust
//! use simd_lookup::compress_store_u32x8;
24//! use wide::u32x8;
25//!
26//! // Compress: select elements where mask bits are set
27//! let data = u32x8::from([10, 20, 30, 40, 50, 60, 70, 80]);
28//! let mask = 0b10110010u8; // Select positions 1, 4, 5, 7
29//! let mut output = [0u32; 8];
30//!
31//! let count = compress_store_u32x8(data, mask, &mut output);
32//! assert_eq!(count, 4);
33//! assert_eq!(&output[..count], &[20, 50, 60, 80]);
34//! ```
35//!
36//! ## Platform Support
37//!
38//! | Platform | Optimization Level |
39//! |----------|-------------------|
40//! | ARM aarch64 (Apple Silicon) | Full NEON optimization |
41//! | x86-64 AVX-512 (Ice Lake+) | Native compress/gather |
42//! | x86-64 AVX2 | Shuffle-based fallbacks |
43//! | Other | Scalar fallbacks |
44
45pub mod bulk_vec_extender;
46pub mod eight_value_lookup;
47pub mod entropy_map_lookup;
48pub mod lookup_kernel;
49pub mod prefetch;
50pub mod simd_compress;
51pub mod simd_gather;
52pub mod small_table;
53pub mod wide_utils;
54
55// Re-export the main types for convenience
56pub use eight_value_lookup::EightValueLookup;
57pub use entropy_map_lookup::{EntropyMapBitpackedLookup, EntropyMapLookup};
58pub use lookup_kernel::{
59 PipelinedSingleTableU32U8Lookup, SimdCascadingTableU32U8Lookup, SimdDualTableWithHashLookup,
60};
61pub use simd_compress::{
62 compress_store_u32x8, compress_store_u32x16, compress_store_u8x16,
63 compress_u32x8, compress_u32x16, compress_u8x16,
64};
65pub use wide_utils::{
66 FromBitmask, SimdSplit, WideUtilsExt,
67};
68pub use simd_gather::{
69 gather_u32index_u8, gather_masked_u32index_u8,
70 gather_u32index_u32, gather_masked_u32index_u32,
71};
72
#[cfg(test)]
mod tests {
    /// Prints which x86 SIMD target features this test binary was compiled with.
    ///
    /// This is a diagnostic rather than a check: it contains no assertions and
    /// only writes one status line per feature to stdout. Run it with
    /// `cargo test -- --nocapture` to see the report.
    #[test]
    fn test_cpu_features() {
        // `cfg!` is resolved at compile time, so each entry collapses to the
        // single message matching the target features the build was given
        // (e.g. via `-C target-feature=...` or `-C target-cpu=native`).
        let report = [
            if cfg!(target_feature = "avx512f") {
                "✓ AVX-512 Foundation (AVX512F): ENABLED"
            } else {
                "✗ AVX-512 Foundation (AVX512F): DISABLED"
            },
            if cfg!(target_feature = "avx512bw") {
                "✓ AVX-512 Byte and Word (AVX512BW): ENABLED"
            } else {
                "✗ AVX-512 Byte and Word (AVX512BW): DISABLED"
            },
            if cfg!(target_feature = "avx512vl") {
                "✓ AVX-512 Vector Length (AVX512VL): ENABLED"
            } else {
                "✗ AVX-512 Vector Length (AVX512VL): DISABLED"
            },
            if cfg!(target_feature = "avx2") {
                "✓ AVX2: ENABLED"
            } else {
                "✗ AVX2: DISABLED"
            },
            if cfg!(target_feature = "avx") {
                "✓ AVX: ENABLED"
            } else {
                "✗ AVX: DISABLED"
            },
        ];

        // One line per feature, widest AVX-512 extensions first, plain AVX last.
        for status in &report {
            println!("{}", status);
        }
    }
}
123}