wow_mpq/simd/
mod.rs

1//! SIMD-accelerated operations with runtime CPU detection
2//!
3//! This module provides hardware-accelerated versions of performance-critical operations
4//! with automatic fallback to scalar implementations.
5//!
6//! ## Features
7//!
8//! - **Runtime CPU Detection**: Automatic detection of available SIMD instruction sets
9//! - **CRC32 Acceleration**: Hardware-accelerated CRC32 using SSE4.2 on x86-64
10//! - **Hash Acceleration**: SIMD-optimized hash functions for large-scale operations
11//! - **Cross-Platform Support**: Optimized implementations for x86-64 and ARM64
12//! - **Safe Fallbacks**: Always provides scalar fallback implementations
13//!
14//! ## Performance Targets
15//!
16//! - **CRC32**: 3-5x faster with SSE4.2 hardware acceleration
17//! - **Hash Operations**: 2-4x faster with vectorized processing
18//! - **Large Archives**: Significant improvements for Cataclysm/MoP size archives
19//! - **Bulk Processing**: 20-40% overall improvement for multi-file operations
20//!
21//! ## Examples
22//!
23//! ```no_run
24//! use wow_mpq::simd::SimdOps;
25//!
26//! let simd = SimdOps::new();
27//!
28//! // Hardware-accelerated CRC32
29//! let crc = simd.crc32(b"test data", 0);
30//!
31//! // SIMD-accelerated hash for large batches
32//! let hash = simd.hash_string_simd(b"filename.mdx", 0);
33//! ```
34
35#[cfg(target_arch = "aarch64")]
36mod aarch64;
37#[cfg(target_arch = "x86_64")]
38mod x86_64;
39
40pub mod scalar; // Fallback implementations
41
/// CPU capabilities detected at runtime.
///
/// Every field is a plain flag, so two values compare equal exactly when all
/// of their flags match.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CpuFeatures {
    /// SSE4.2 support for CRC32 instructions.
    ///
    /// Always `false` on targets other than x86-64.
    pub has_sse42: bool,
    /// AVX2 support for 256-bit vector operations.
    ///
    /// Always `false` on targets other than x86-64.
    pub has_avx2: bool,
    /// AES instructions for cryptographic operations.
    ///
    /// On ARM64 this reflects the `aes` feature reported by the standard library.
    pub has_aes: bool,
    /// PCLMULQDQ for carryless multiplication.
    ///
    /// On ARM64 this flag is populated from the `pmull` feature instead.
    pub has_pclmulqdq: bool,
    /// ARM NEON support (ARM64 only; the field does not exist on other targets).
    #[cfg(target_arch = "aarch64")]
    pub has_neon: bool,
}
57
/// The default value is not a fixed constant: it is whatever
/// `detect_cpu_features` reports at the moment `default()` is called.
impl Default for CpuFeatures {
    /// Returns the feature set produced by `detect_cpu_features()`.
    fn default() -> Self {
        detect_cpu_features()
    }
}
63
64/// SIMD-optimized operations interface
65#[derive(Debug)]
66pub struct SimdOps {
67    features: CpuFeatures,
68}
69
70impl SimdOps {
71    /// Create new SIMD operations with runtime CPU detection
72    pub fn new() -> Self {
73        Self {
74            features: detect_cpu_features(),
75        }
76    }
77
78    /// Get detected CPU features
79    pub fn features(&self) -> &CpuFeatures {
80        &self.features
81    }
82
83    /// Hardware-accelerated CRC32 calculation
84    ///
85    /// Uses SSE4.2 CRC32 instruction when available, falls back to scalar implementation.
86    /// Processes data in 8-byte chunks for maximum efficiency.
87    pub fn crc32(&self, data: &[u8], initial: u32) -> u32 {
88        #[cfg(target_arch = "x86_64")]
89        {
90            if self.features.has_sse42 {
91                // Use hardware-accelerated CRC32
92                return unsafe { x86_64::crc32_sse42(data, initial) };
93            }
94        }
95
96        // Fall back to scalar implementation
97        scalar::crc32_scalar(data, initial)
98    }
99
100    /// SIMD-accelerated hash computation for file lookups
101    ///
102    /// Optimizes hash computation for batch processing of multiple filenames.
103    /// Uses AVX2 on x86-64 or NEON on ARM64 when available.
104    pub fn hash_string_simd(&self, data: &[u8], hash_type: u32) -> u32 {
105        #[cfg(target_arch = "x86_64")]
106        {
107            if self.features.has_avx2 && data.len() >= 32 {
108                // Use AVX2 for large strings
109                return unsafe { x86_64::hash_string_avx2(data, hash_type) };
110            }
111        }
112
113        #[cfg(target_arch = "aarch64")]
114        {
115            if self.features.has_neon && data.len() >= 16 {
116                // Use NEON for ARM64
117                return unsafe { aarch64::hash_string_neon(data, hash_type) };
118            }
119        }
120
121        // Fall back to scalar implementation
122        scalar::hash_string_scalar(data, hash_type)
123    }
124
125    /// SIMD-accelerated Jenkins hash for batch processing
126    ///
127    /// Optimizes Jenkins one-at-a-time hash for processing multiple files.
128    pub fn jenkins_hash_batch(&self, filenames: &[&str]) -> Vec<u64> {
129        let mut results = Vec::with_capacity(filenames.len());
130
131        #[cfg(target_arch = "x86_64")]
132        {
133            if self.features.has_avx2 && filenames.len() >= 4 {
134                // Process 4 filenames at once with AVX2
135                return unsafe { x86_64::jenkins_hash_batch_avx2(filenames) };
136            }
137        }
138
139        // Process one by one with scalar fallback
140        for filename in filenames {
141            results.push(scalar::jenkins_hash_scalar(filename));
142        }
143
144        results
145    }
146
147    /// Check if any SIMD optimizations are available
148    pub fn has_simd_support(&self) -> bool {
149        self.features.has_sse42
150            || self.features.has_avx2
151            || self.features.has_aes
152            || self.features.has_pclmulqdq
153            || {
154                #[cfg(target_arch = "aarch64")]
155                {
156                    self.features.has_neon
157                }
158                #[cfg(not(target_arch = "aarch64"))]
159                {
160                    false
161                }
162            }
163    }
164}
165
impl Default for SimdOps {
    /// Equivalent to `SimdOps::new()`, so CPU features are detected when the
    /// default value is built.
    fn default() -> Self {
        Self::new()
    }
}
171
172/// Runtime CPU feature detection
173fn detect_cpu_features() -> CpuFeatures {
174    #[cfg(target_arch = "x86_64")]
175    {
176        CpuFeatures {
177            has_sse42: is_x86_feature_detected!("sse4.2"),
178            has_avx2: is_x86_feature_detected!("avx2"),
179            has_aes: is_x86_feature_detected!("aes"),
180            has_pclmulqdq: is_x86_feature_detected!("pclmulqdq"),
181        }
182    }
183
184    #[cfg(target_arch = "aarch64")]
185    {
186        CpuFeatures {
187            has_sse42: false,
188            has_avx2: false,
189            has_aes: is_aarch64_feature_detected!("aes"),
190            has_pclmulqdq: is_aarch64_feature_detected!("pmull"),
191            has_neon: is_aarch64_feature_detected!("neon"),
192        }
193    }
194
195    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
196    {
197        CpuFeatures {
198            has_sse42: false,
199            has_avx2: false,
200            has_aes: false,
201            has_pclmulqdq: false,
202        }
203    }
204}
205
/// Tests that compare every `SimdOps` entry point against the scalar
/// implementation it is supposed to be interchangeable with.
#[cfg(test)]
mod tests {
    use super::*;

    /// Deterministic upper-case ASCII bytes of exactly `len` bytes.
    fn sample_bytes(len: usize) -> Vec<u8> {
        (0..len).map(|i| b'A' + (i % 26) as u8).collect()
    }

    #[test]
    fn test_simd_ops_creation() {
        let simd = SimdOps::new();
        let features = simd.features();

        // Should successfully detect features without crashing
        println!("Detected SIMD features:");
        println!("  SSE4.2: {}", features.has_sse42);
        println!("  AVX2: {}", features.has_avx2);
        println!("  AES: {}", features.has_aes);
        println!("  PCLMULQDQ: {}", features.has_pclmulqdq);

        #[cfg(target_arch = "aarch64")]
        println!("  NEON: {}", features.has_neon);

        // Should indicate if any SIMD support is available
        let has_simd = simd.has_simd_support();
        println!("  SIMD supported: {}", has_simd);

        // Any single detected flag must be enough to report SIMD support.
        if features.has_sse42 || features.has_avx2 || features.has_aes || features.has_pclmulqdq {
            assert!(
                has_simd,
                "has_simd_support should be true when a feature flag is set"
            );
        }
    }

    #[test]
    fn test_crc32_simd_correctness() {
        let simd = SimdOps::new();
        let test_data = b"The quick brown fox jumps over the lazy dog";

        let simd_result = simd.crc32(test_data, 0);
        let scalar_result = scalar::crc32_scalar(test_data, 0);

        // SIMD and scalar results should match
        assert_eq!(
            simd_result, scalar_result,
            "SIMD CRC32 should match scalar implementation"
        );

        // Test with different initial values
        let simd_result2 = simd.crc32(test_data, 0x12345678);
        let scalar_result2 = scalar::crc32_scalar(test_data, 0x12345678);

        assert_eq!(
            simd_result2, scalar_result2,
            "SIMD CRC32 with initial value should match scalar"
        );
    }

    #[test]
    fn test_hash_string_simd_correctness() {
        let simd = SimdOps::new();
        let test_string = b"Units\\Human\\Footman.mdx";

        let simd_result = simd.hash_string_simd(test_string, 0);
        let scalar_result = scalar::hash_string_scalar(test_string, 0);

        // SIMD and scalar results should match
        assert_eq!(
            simd_result, scalar_result,
            "SIMD hash should match scalar implementation"
        );

        // Test with different hash types
        for hash_type in [0, 1, 2, 3] {
            let simd_result = simd.hash_string_simd(test_string, hash_type);
            let scalar_result = scalar::hash_string_scalar(test_string, hash_type);

            assert_eq!(
                simd_result, scalar_result,
                "SIMD hash type {} should match scalar",
                hash_type
            );
        }
    }

    #[test]
    fn test_hash_string_simd_long_inputs() {
        let simd = SimdOps::new();

        // `hash_string_simd` only leaves the scalar path for inputs of at least
        // 16 bytes (NEON) or 32 bytes (AVX2). The 23-byte name used in the test
        // above never reaches the AVX2 routine, so probe both sides of each
        // threshold as well as clearly larger inputs.
        for len in [15usize, 16, 17, 31, 32, 33, 64, 100] {
            let data = sample_bytes(len);

            for hash_type in [0, 1, 2, 3] {
                assert_eq!(
                    simd.hash_string_simd(data.as_slice(), hash_type),
                    scalar::hash_string_scalar(data.as_slice(), hash_type),
                    "SIMD hash type {} should match scalar for {}-byte input",
                    hash_type,
                    len
                );
            }
        }

        // A realistic archive path that is long enough for every SIMD path.
        let long_path = b"World\\Maps\\Azeroth\\Azeroth_32_48.adt";
        assert!(long_path.len() >= 32);

        for hash_type in [0, 1, 2, 3] {
            assert_eq!(
                simd.hash_string_simd(long_path, hash_type),
                scalar::hash_string_scalar(long_path, hash_type),
                "SIMD hash type {} should match scalar for a long path",
                hash_type
            );
        }
    }

    #[test]
    fn test_jenkins_hash_batch() {
        let simd = SimdOps::new();
        let filenames = [
            "file1.txt",
            "file2.txt",
            "file3.txt",
            "file4.txt",
            "file5.txt",
        ];

        let batch_result = simd.jenkins_hash_batch(&filenames);
        assert_eq!(batch_result.len(), filenames.len());

        // Verify each result matches scalar implementation
        for (i, filename) in filenames.iter().enumerate() {
            let scalar_result = scalar::jenkins_hash_scalar(filename);
            assert_eq!(
                batch_result[i], scalar_result,
                "Batch Jenkins hash for '{}' should match scalar",
                filename
            );
        }
    }

    #[test]
    fn test_empty_input_handling() {
        let simd = SimdOps::new();

        // Empty data should work without crashing
        let empty_crc = simd.crc32(&[], 0);
        let scalar_empty_crc = scalar::crc32_scalar(&[], 0);
        assert_eq!(empty_crc, scalar_empty_crc);

        let empty_hash = simd.hash_string_simd(&[], 0);
        let scalar_empty_hash = scalar::hash_string_scalar(&[], 0);
        assert_eq!(empty_hash, scalar_empty_hash);

        // Empty batch should return empty results
        let empty_batch = simd.jenkins_hash_batch(&[]);
        assert!(empty_batch.is_empty());
    }

    #[test]
    fn test_feature_detection_stability() {
        // Feature detection should be stable across multiple calls
        let features1 = detect_cpu_features();
        let features2 = detect_cpu_features();

        assert_eq!(features1.has_sse42, features2.has_sse42);
        assert_eq!(features1.has_avx2, features2.has_avx2);
        assert_eq!(features1.has_aes, features2.has_aes);
        assert_eq!(features1.has_pclmulqdq, features2.has_pclmulqdq);

        #[cfg(target_arch = "aarch64")]
        assert_eq!(features1.has_neon, features2.has_neon);
    }
}