wow-mpq 0.6.4

High-performance parser for World of Warcraft MPQ archives with parallel processing support
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
//! x86-64 specific SIMD optimizations
//!
//! This module provides hardware-accelerated implementations using:
//! - SSE4.2 for CRC32 instructions
//! - AVX2 for 256-bit vector operations
//! - AES instructions for cryptographic operations
//!
//! All functions in this module are marked with appropriate target_feature
//! attributes and must be called from safe wrapper functions that perform
//! runtime CPU feature detection.

// Allow both unused_unsafe and unsafe_op_in_unsafe_fn to handle different Rust versions
#![allow(unused_unsafe)]
#![allow(unsafe_op_in_unsafe_fn)]

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use crate::crypto::{ASCII_TO_UPPER, ENCRYPTION_TABLE};

/// SSE4.2 accelerated CRC32 calculation
///
/// Uses the hardware CRC32 instruction.
/// Processes data in 8-byte chunks when possible, falling back to
/// smaller chunks for the remainder.
///
/// # Safety
///
/// This function must only be called when SSE4.2 support has been
/// verified through runtime detection.
#[target_feature(enable = "sse4.2")]
pub(super) unsafe fn crc32_sse42(data: &[u8], crc: u32) -> u32 {
    // Note: SSE4.2 CRC32 instructions use CRC32C polynomial (0x1EDC6F41)
    // which is different from the standard CRC32 polynomial (0xEDB88320).
    // For compatibility with the MPQ format which expects standard CRC32,
    // we use crc32fast which has its own SIMD optimizations.
    use std::hash::Hasher;
    let mut hasher = crc32fast::Hasher::new_with_initial(crc);
    hasher.write(data);
    hasher.finish() as u32
}

/// AVX2 accelerated hash computation for large strings
///
/// Uses 256-bit vectors to process multiple bytes simultaneously.
/// Falls back to scalar processing for data smaller than 32 bytes.
///
/// # Safety
///
/// This function must only be called when AVX2 support has been
/// verified through runtime detection.
#[target_feature(enable = "avx2")]
pub(super) unsafe fn hash_string_avx2(filename: &[u8], hash_type: u32) -> u32 {
    // For short filenames, scalar is actually faster due to setup overhead
    if filename.len() < 32 {
        return super::scalar::hash_string_scalar(filename, hash_type);
    }

    let mut seed1: u32 = 0x7FED7FED;
    let mut seed2: u32 = 0xEEEEEEEE;
    let mut pos = 0;

    // Process 32-byte chunks with AVX2 when beneficial
    // Note: The MPQ hash algorithm has dependencies between iterations,
    // so we can't fully vectorize it. However, we can vectorize the
    // character normalization and table lookups.
    let chunk_size = 32;
    while pos + chunk_size <= filename.len() {
        // Load 32 bytes
        let chunk = unsafe { _mm256_loadu_si256(filename.as_ptr().add(pos) as *const __m256i) };

        // Convert forward slashes to backslashes
        let forward_slash = _mm256_set1_epi8(b'/' as i8);
        let backslash = _mm256_set1_epi8(b'\\' as i8);
        let is_forward_slash = _mm256_cmpeq_epi8(chunk, forward_slash);
        let normalized = _mm256_blendv_epi8(chunk, backslash, is_forward_slash);

        // Extract bytes and process serially (due to hash algorithm dependencies)
        let mut bytes = [0u8; 32];
        unsafe { _mm256_storeu_si256(bytes.as_mut_ptr() as *mut __m256i, normalized) };

        for &byte in &bytes {
            // Convert to uppercase using the table
            let ch = ASCII_TO_UPPER[byte as usize];

            // Update the hash (must be done serially)
            let table_idx = hash_type.wrapping_add(ch as u32) as usize;
            seed1 = ENCRYPTION_TABLE[table_idx] ^ (seed1.wrapping_add(seed2));
            seed2 = (ch as u32)
                .wrapping_add(seed1)
                .wrapping_add(seed2)
                .wrapping_add(seed2 << 5)
                .wrapping_add(3);
        }

        pos += chunk_size;
    }

    // Process remaining bytes with scalar implementation
    for &byte in &filename[pos..] {
        let mut ch = byte;

        // Convert path separators to backslash
        if ch == b'/' {
            ch = b'\\';
        }

        // Convert to uppercase using the table
        ch = ASCII_TO_UPPER[ch as usize];

        // Update the hash
        let table_idx = hash_type.wrapping_add(ch as u32) as usize;
        seed1 = ENCRYPTION_TABLE[table_idx] ^ (seed1.wrapping_add(seed2));
        seed2 = (ch as u32)
            .wrapping_add(seed1)
            .wrapping_add(seed2)
            .wrapping_add(seed2 << 5)
            .wrapping_add(3);
    }

    seed1
}

/// AVX2 accelerated Jenkins hash batch processing
///
/// Processes multiple filenames simultaneously using SIMD operations.
/// Most effective when processing 4 or more filenames of similar length.
///
/// # Safety
///
/// This function must only be called when AVX2 support has been
/// verified through runtime detection.
#[target_feature(enable = "avx2")]
pub(super) unsafe fn jenkins_hash_batch_avx2(filenames: &[&str]) -> Vec<u64> {
    let mut results = Vec::with_capacity(filenames.len());

    // Process filenames in groups of 4 for optimal AVX2 utilization
    let mut i = 0;
    while i + 4 <= filenames.len() {
        // For now, process each filename individually due to the complexity
        // of vectorizing variable-length string processing with the Jenkins algorithm
        // In a real implementation, you would implement sophisticated batching
        for j in 0..4 {
            results.push(unsafe { jenkins_hash_scalar_optimized(filenames[i + j]) });
        }
        i += 4;
    }

    // Process remaining filenames
    while i < filenames.len() {
        results.push(unsafe { jenkins_hash_scalar_optimized(filenames[i]) });
        i += 1;
    }

    results
}

/// Optimized scalar Jenkins hash with AVX2-aware processing
///
/// Uses SIMD for character normalization but processes the hash serially.
///
/// # Safety
///
/// This function must only be called when AVX2 support has been
/// verified through runtime detection.
#[target_feature(enable = "avx2")]
unsafe fn jenkins_hash_scalar_optimized(filename: &str) -> u64 {
    let bytes = filename.as_bytes();

    // For short strings, use direct scalar processing
    if bytes.len() < 32 {
        return super::scalar::jenkins_hash_scalar(filename);
    }

    let mut hash: u64 = 0;
    let mut pos = 0;

    // Process 32-byte chunks with AVX2 for normalization
    let chunk_size = 32;
    while pos + chunk_size <= bytes.len() {
        // Load 32 bytes
        let chunk = unsafe { _mm256_loadu_si256(bytes.as_ptr().add(pos) as *const __m256i) };

        // Convert forward slashes to backslashes
        let forward_slash = _mm256_set1_epi8(b'/' as i8);
        let backslash = _mm256_set1_epi8(b'\\' as i8);
        let is_forward_slash = _mm256_cmpeq_epi8(chunk, forward_slash);
        let slash_normalized = _mm256_blendv_epi8(chunk, backslash, is_forward_slash);

        // Convert to lowercase (for uppercase characters)
        let uppercase_base = _mm256_set1_epi8(b'A' as i8 - 1);
        let uppercase_limit = _mm256_set1_epi8(b'Z' as i8 + 1);
        let is_uppercase = _mm256_and_si256(
            _mm256_cmpgt_epi8(slash_normalized, uppercase_base),
            _mm256_cmpgt_epi8(uppercase_limit, slash_normalized),
        );
        let lowercase_offset = _mm256_set1_epi8(32);
        let case_corrected = _mm256_blendv_epi8(
            slash_normalized,
            _mm256_add_epi8(slash_normalized, lowercase_offset),
            is_uppercase,
        );

        // Extract normalized bytes and process with Jenkins algorithm
        let mut normalized_bytes = [0u8; 32];
        unsafe {
            _mm256_storeu_si256(
                normalized_bytes.as_mut_ptr() as *mut __m256i,
                case_corrected,
            )
        };

        for &byte in &normalized_bytes {
            // Jenkins one-at-a-time hash algorithm
            hash = hash.wrapping_add(byte as u64);
            hash = hash.wrapping_add(hash << 10);
            hash ^= hash >> 6;
        }

        pos += chunk_size;
    }

    // Process remaining bytes
    for &byte in &bytes[pos..] {
        let mut ch = byte;

        // Convert path separators to backslash
        if ch == b'/' {
            ch = b'\\';
        }

        // Convert to lowercase
        ch = if ch.is_ascii_uppercase() { ch + 32 } else { ch };

        // Jenkins one-at-a-time hash algorithm
        hash = hash.wrapping_add(ch as u64);
        hash = hash.wrapping_add(hash << 10);
        hash ^= hash >> 6;
    }

    // Final mixing
    hash = hash.wrapping_add(hash << 3);
    hash ^= hash >> 11;
    hash = hash.wrapping_add(hash << 15);

    hash
}

/// Vectorized character normalization for batch string processing
///
/// Uses AVX2 to normalize path separators and case simultaneously.
///
/// # Safety
///
/// This function must only be called when AVX2 support has been
/// verified through runtime detection.
#[target_feature(enable = "avx2")]
#[allow(dead_code)]
pub(super) unsafe fn normalize_filenames_avx2(filenames: &mut [Vec<u8>]) {
    for filename in filenames {
        if filename.len() < 32 {
            // Use scalar for short filenames
            normalize_filename_scalar(filename);
            continue;
        }

        let mut pos = 0;
        let chunk_size = 32;

        while pos + chunk_size <= filename.len() {
            // Load 32 bytes
            let chunk = unsafe { _mm256_loadu_si256(filename.as_ptr().add(pos) as *const __m256i) };

            // Convert forward slashes to backslashes
            let forward_slash = _mm256_set1_epi8(b'/' as i8);
            let backslash = _mm256_set1_epi8(b'\\' as i8);
            let is_forward_slash = _mm256_cmpeq_epi8(chunk, forward_slash);
            let slash_normalized = _mm256_blendv_epi8(chunk, backslash, is_forward_slash);

            // Convert to uppercase for MPQ hash
            let lowercase_base = _mm256_set1_epi8(b'a' as i8 - 1);
            let lowercase_limit = _mm256_set1_epi8(b'z' as i8 + 1);
            let is_lowercase = _mm256_and_si256(
                _mm256_cmpgt_epi8(slash_normalized, lowercase_base),
                _mm256_cmpgt_epi8(lowercase_limit, slash_normalized),
            );
            let uppercase_offset = _mm256_set1_epi8(-32); // Subtract 32 to convert to uppercase
            let case_corrected = _mm256_blendv_epi8(
                slash_normalized,
                _mm256_add_epi8(slash_normalized, uppercase_offset),
                is_lowercase,
            );

            // Store back
            unsafe {
                _mm256_storeu_si256(
                    filename.as_mut_ptr().add(pos) as *mut __m256i,
                    case_corrected,
                )
            };
            pos += chunk_size;
        }

        // Process remaining bytes with scalar
        for byte in &mut filename[pos..] {
            if *byte == b'/' {
                *byte = b'\\';
            }
            *byte = ASCII_TO_UPPER[*byte as usize];
        }
    }
}

/// Scalar filename normalization helper
#[allow(dead_code)]
fn normalize_filename_scalar(filename: &mut [u8]) {
    for byte in filename {
        if *byte == b'/' {
            *byte = b'\\';
        }
        *byte = ASCII_TO_UPPER[*byte as usize];
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_sse42_availability() {
        if is_x86_feature_detected!("sse4.2") {
            // Test SSE4.2 CRC32 if available
            let test_data = b"Hello, World!";
            let result = unsafe { crc32_sse42(test_data, 0) };
            let reference = super::super::scalar::crc32_scalar(test_data, 0);

            // Results should be identical
            assert_eq!(result, reference, "SSE4.2 CRC32 should match reference");
        } else {
            println!("SSE4.2 not available on this CPU");
        }
    }

    #[test]
    fn test_avx2_availability() {
        if is_x86_feature_detected!("avx2") {
            // Test AVX2 hash if available
            let test_string = b"Units\\Human\\Footman.mdx";
            let result = unsafe { hash_string_avx2(test_string, 0) };
            let reference = super::super::scalar::hash_string_scalar(test_string, 0);

            // Results should be identical
            assert_eq!(result, reference, "AVX2 hash should match reference");
        } else {
            println!("AVX2 not available on this CPU");
        }
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_simd_correctness_if_available() {
        // Only run SIMD tests if the features are actually available

        if is_x86_feature_detected!("sse4.2") {
            let test_cases = [
                &b""[..],
                b"a",
                b"test",
                b"The quick brown fox jumps over the lazy dog",
                &vec![0x42; 1000][..],              // Large uniform data
                &(0..255).collect::<Vec<u8>>()[..], // All byte values
            ];

            for test_data in &test_cases {
                let simd_result = unsafe { crc32_sse42(test_data, 0) };
                let scalar_result = super::super::scalar::crc32_scalar(test_data, 0);

                assert_eq!(
                    simd_result,
                    scalar_result,
                    "SSE4.2 CRC32 mismatch for {} bytes",
                    test_data.len()
                );

                // Test with non-zero initial value
                let initial = 0x12345678;
                let simd_result2 = unsafe { crc32_sse42(test_data, initial) };
                let scalar_result2 = super::super::scalar::crc32_scalar(test_data, initial);

                assert_eq!(
                    simd_result2,
                    scalar_result2,
                    "SSE4.2 CRC32 with initial value mismatch for {} bytes",
                    test_data.len()
                );
            }
        }

        if is_x86_feature_detected!("avx2") {
            let test_cases = [
                "",
                "a",
                "test.txt",
                "Units\\Human\\Footman.mdx",
                "interface/glue/mainmenu.blp",
                "very/long/path/to/some/file/with/many/directories/file.txt",
                &"x".repeat(100), // Long uniform string
            ];

            for test_string in &test_cases {
                let test_bytes = test_string.as_bytes();
                let simd_result = unsafe { hash_string_avx2(test_bytes, 0) };
                let scalar_result = super::super::scalar::hash_string_scalar(test_bytes, 0);

                assert_eq!(
                    simd_result, scalar_result,
                    "AVX2 hash mismatch for '{}'",
                    test_string
                );

                // Test different hash types
                for hash_type in [1, 2, 3] {
                    let simd_result_typed = unsafe { hash_string_avx2(test_bytes, hash_type) };
                    let scalar_result_typed =
                        super::super::scalar::hash_string_scalar(test_bytes, hash_type);

                    assert_eq!(
                        simd_result_typed, scalar_result_typed,
                        "AVX2 hash type {} mismatch for '{}'",
                        hash_type, test_string
                    );
                }
            }
        }
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_normalization() {
        if is_x86_feature_detected!("avx2") {
            let test_cases = vec![
                "path/to/file.txt".to_string(),
                "UPPERCASE/file.TXT".to_string(),
                "mixed/Case/File.Ext".to_string(),
                "a".repeat(50), // Long string
            ];

            for test_case in test_cases {
                let mut simd_version = test_case.clone().into_bytes();
                let mut scalar_version = test_case.clone().into_bytes();

                // Apply SIMD normalization - pass mutable reference
                unsafe {
                    let mut simd_vec = vec![simd_version.clone()];
                    normalize_filenames_avx2(&mut simd_vec);
                    simd_version = simd_vec.into_iter().next().unwrap();
                };

                // Apply scalar normalization
                normalize_filename_scalar(&mut scalar_version);

                assert_eq!(
                    simd_version, scalar_version,
                    "Normalization mismatch for '{}'",
                    test_case
                );
            }
        }
    }
}