zipora 3.1.5

High-performance Rust implementation providing advanced data structures and compression algorithms with memory safety guarantees. Features LRU page cache, sophisticated caching layer, fiber-based concurrency, real-time compression, secure memory pools, SIMD optimizations, and complete C FFI for migration from C++.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
//! # Enhanced CPU Feature Detection System
//!
//! Comprehensive runtime CPU feature detection with adaptive algorithm selection.
//! Inspired by production-grade feature detection systems and high-performance
//! libraries with additional Rust-specific optimizations.
//!
//! # Architecture
//!
//! This module implements sophisticated hardware detection following Phase 1.1 of the
//! systematic SIMD implementation plan:
//! - Enhanced x86_64 feature detection (SSE4.1/4.2, AVX, AVX2, AVX-512, BMI1/2, etc.)
//! - Comprehensive ARM64 feature detection (NEON, CRC32, Crypto, SVE)
//! - Cache characteristics detection with accurate sizing
//! - Build system integration with feature flags
//! - Runtime optimal algorithm selection

use std::sync::OnceLock;
use std::collections::HashMap;

/// Comprehensive CPU feature flags for runtime detection.
///
/// Each variant names a single hardware capability that callers can query via
/// `CpuFeatures::has_feature`, `CpuFeatureSet::has_feature` or the free
/// function `has_cpu_feature`.
///
/// Not every variant is backed by a field in `CpuFeatures`; variants that
/// `CpuFeatures::has_feature` does not handle explicitly fall through to its
/// catch-all arm and are reported as unavailable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CpuFeature {
    // x86_64 Basic SSE/AVX features
    SSE2,
    SSE3,
    SSSE3,
    SSE4_1,
    SSE4_2,
    AVX,
    AVX2,
    
    // x86_64 BMI and specialized instructions
    BMI1,
    BMI2,
    POPCNT,
    LZCNT,
    TZCNT,
    PREFETCHW,
    
    // x86_64 Crypto and specialized features
    PCLMULQDQ,
    AES,
    RDRAND,
    RDSEED,
    
    // x86_64 AVX-512 feature family
    AVX512F,        // Foundation
    AVX512DQ,       // Doubleword and Quadword Instructions
    AVX512CD,       // Conflict Detection Instructions
    AVX512BW,       // Byte and Word Instructions
    AVX512VL,       // Vector Length Extensions
    AVX512VPOPCNTDQ, // Vector Population Count D/Q
    AVX512VBMI,     // Vector Bit Manipulation Instructions
    AVX512IFMA,     // Integer Fused Multiply-Add
    
    // ARM64 features
    NEON,
    CRC32,
    // ARM AES flag, kept distinct from the x86_64 `AES` variant above
    AesArm,
    SHA1,
    SHA2,
    SHA3,
    Crypto,
    SVE,            // Scalable Vector Extension
    SVE2,           // Scalable Vector Extension 2
    
    // Universal features
    UnalignedAccess,
    
    // Memory and cache features
    Prefetch,
    ClflushOpt,
    Clwb,
}

/// Primary CPU feature set with comprehensive performance characteristics
/// 
/// This is the main CPU features interface, providing comprehensive hardware detection
/// and optimization strategy selection for SIMD operations.
///
/// All fields are public plain data: a value can be built by hand (see
/// `CpuFeatures::new`) or filled in by `RuntimeCpuFeatures::detect_features`.
#[derive(Debug, Clone)]
pub struct CpuFeatures {
    // x86_64 SSE/AVX features
    /// SSE4.1 is available.
    pub has_sse41: bool,
    /// SSE4.2 is available.
    pub has_sse42: bool,
    /// AVX is available.
    pub has_avx: bool,
    /// AVX2 is available.
    pub has_avx2: bool,
    /// AVX-512 Foundation is available.
    pub has_avx512f: bool,
    /// AVX-512 Vector Length extensions are available.
    pub has_avx512vl: bool,
    /// AVX-512 Byte and Word instructions are available.
    pub has_avx512bw: bool,
    /// AVX-512 VPOPCNTDQ (vector population count) is available.
    pub has_avx512vpopcntdq: bool,
    
    // x86_64 BMI and specialized instructions
    /// BMI1 is available.
    pub has_bmi1: bool,
    /// BMI2 is available.
    pub has_bmi2: bool,
    /// POPCNT is available.
    pub has_popcnt: bool,
    /// LZCNT is available.
    pub has_lzcnt: bool,
    /// TZCNT is available.
    pub has_tzcnt: bool,
    /// PREFETCHW is available.
    pub has_prefetchw: bool,
    
    // ARM64 features
    /// NEON is available.
    pub has_neon: bool,
    /// ARM CRC32 instructions are available.
    pub has_crc32: bool,
    /// ARM cryptographic extensions are available.
    pub has_crypto: bool,
    /// Scalable Vector Extension is available.
    pub has_sve: bool,
    /// Scalable Vector Extension 2 is available.
    pub has_sve2: bool,
    
    // Cache characteristics
    /// L1 cache size in bytes (`new` defaults this to 32 KiB).
    pub l1_cache_size: usize,
    /// L2 cache size in bytes (`new` defaults this to 256 KiB).
    pub l2_cache_size: usize,
    /// L3 cache size in bytes (`new` defaults this to 8 MiB).
    pub l3_cache_size: usize,
    /// Cache line size in bytes (`new` defaults this to 64).
    pub cache_line_size: usize,
    
    // System characteristics
    /// Number of logical cores.
    pub logical_cores: usize,
    /// Number of physical cores.
    pub physical_cores: usize,
    /// CPU vendor string (empty until detection fills it in).
    pub vendor: String,
    /// CPU model string (empty until detection fills it in).
    pub model: String,
    
    // Performance optimization tier
    /// Optimization tier: 0 after `new`, then 1 (scalar fallback) through
    /// 5 (AVX-512 with popcount) once `detect_and_configure_simd` has run.
    pub optimization_tier: u8,
    /// SIMD tier: 0 (scalar fallback) through 4 (AVX-512), set by
    /// `detect_and_configure_simd`.
    pub simd_tier: u8,
}

/// Legacy CPU feature set for backward compatibility
///
/// Features are stored in a map keyed by `CpuFeature`; a feature that is
/// missing from the map is treated as unavailable by `has_feature`.
#[derive(Debug, Clone)]
pub struct CpuFeatureSet {
    /// Available CPU features
    pub features: HashMap<CpuFeature, bool>,
    /// CPU vendor (Intel, AMD, ARM, etc.)
    pub vendor: String,
    /// CPU model name
    pub model: String,
    /// Number of logical cores
    pub logical_cores: usize,
    /// Number of physical cores
    pub physical_cores: usize,
    /// Cache line size (typically 64 bytes)
    pub cache_line_size: usize,
    /// L1 cache size (data)
    pub l1_cache_size: usize,
    /// L2 cache size
    pub l2_cache_size: usize,
    /// L3 cache size
    pub l3_cache_size: usize,
    /// SIMD optimization tier, on the scale used by `get_simd_tier`
    /// (0=scalar, 1=basic, 2=intermediate, 3=advanced, 4=cutting-edge)
    pub simd_tier: u8,
}

impl CpuFeatures {
    /// Build a `CpuFeatures` value with every instruction-set flag cleared.
    ///
    /// Cache geometry starts at fixed defaults (32 KiB L1, 256 KiB L2, 8 MiB L3,
    /// 64-byte lines), both core counts start at one, vendor and model are
    /// empty, and both tiers are zero.
    pub fn new() -> Self {
        const KIB: usize = 1024;
        const MIB: usize = 1024 * KIB;

        Self {
            // Cache characteristics (default values)
            cache_line_size: 64,
            l1_cache_size: 32 * KIB,
            l2_cache_size: 256 * KIB,
            l3_cache_size: 8 * MIB,

            // System characteristics
            vendor: String::new(),
            model: String::new(),
            logical_cores: 1,
            physical_cores: 1,

            // Performance tiers are assigned later by `detect_and_configure_simd`
            optimization_tier: 0,
            simd_tier: 0,

            // x86_64 SSE/AVX features
            has_sse41: false,
            has_sse42: false,
            has_avx: false,
            has_avx2: false,
            has_avx512f: false,
            has_avx512vl: false,
            has_avx512bw: false,
            has_avx512vpopcntdq: false,

            // x86_64 BMI and specialized instructions
            has_bmi1: false,
            has_bmi2: false,
            has_popcnt: false,
            has_lzcnt: false,
            has_tzcnt: false,
            has_prefetchw: false,

            // ARM64 features
            has_neon: false,
            has_crc32: false,
            has_crypto: false,
            has_sve: false,
            has_sve2: false,
        }
    }
    
    /// Recompute both tier fields from the feature flags currently stored in `self`.
    ///
    /// Call this after the `has_*` flags have been populated; it does not probe
    /// the hardware itself.
    pub fn detect_and_configure_simd(&mut self) {
        // Both calculations only read the flags, so compute first and store after.
        let optimization = self.calculate_optimization_tier();
        let simd = self.calculate_simd_tier();

        self.optimization_tier = optimization;
        self.simd_tier = simd;
    }
    
    /// Derive the optimization tier (1-5) from the stored feature flags.
    ///
    /// * 5 - AVX-512 F + BW + VPOPCNTDQ
    /// * 4 - AVX2 together with BMI2
    /// * 3 - BMI2 alone
    /// * 2 - POPCNT or NEON
    /// * 1 - scalar fallback
    fn calculate_optimization_tier(&self) -> u8 {
        let avx512_with_popcnt =
            self.has_avx512f && self.has_avx512bw && self.has_avx512vpopcntdq;
        if avx512_with_popcnt {
            return 5;
        }

        match (self.has_bmi2, self.has_avx2) {
            (true, true) => 4,
            (true, false) => 3,
            (false, _) if self.has_popcnt || self.has_neon => 2,
            (false, _) => 1,
        }
    }
    
    /// Derive the SIMD tier (0-4) from the stored feature flags.
    ///
    /// * 4 - AVX-512 Foundation
    /// * 3 - AVX2 together with BMI2
    /// * 2 - AVX2 alone
    /// * 1 - POPCNT or NEON
    /// * 0 - scalar fallback
    fn calculate_simd_tier(&self) -> u8 {
        if self.has_avx512f {
            return 4;
        }
        if self.has_avx2 {
            return if self.has_bmi2 { 3 } else { 2 };
        }
        // `true` maps to tier 1, `false` to the scalar tier 0.
        u8::from(self.has_popcnt || self.has_neon)
    }
    
    /// Name the preferred rank/select implementation for the stored flags.
    ///
    /// Returns one of `"avx512_popcnt"`, `"bmi2_avx2"`, `"avx2"`, `"bmi2"`,
    /// `"popcnt"`, `"neon"` or `"scalar"`, checked in that priority order.
    pub fn optimal_rank_select_variant(&self) -> &'static str {
        if self.has_avx512f && self.has_avx512bw && self.has_avx512vpopcntdq {
            return "avx512_popcnt";
        }

        match (self.has_avx2, self.has_bmi2) {
            (true, true) => "bmi2_avx2",
            (true, false) => "avx2",
            (false, true) => "bmi2",
            (false, false) if self.has_popcnt => "popcnt",
            (false, false) if self.has_neon => "neon",
            (false, false) => "scalar",
        }
    }
    
    /// Name the preferred string-search implementation for the stored flags.
    ///
    /// SSE4.2 is checked first, then AVX2, then NEON; `"scalar"` is the fallback.
    pub fn optimal_string_search_variant(&self) -> &'static str {
        if self.has_sse42 {
            return "sse42_pcmpestri";
        }
        if self.has_avx2 {
            return "avx2_search";
        }
        if self.has_neon {
            return "neon_search";
        }
        "scalar"
    }
    
    /// Name the preferred memory-copy implementation for the stored flags.
    ///
    /// Priority: AVX-512 Foundation, then AVX2, then NEON, then the scalar copy.
    pub fn optimal_memcpy_variant(&self) -> &'static str {
        match (self.has_avx512f, self.has_avx2, self.has_neon) {
            (true, _, _) => "avx512_memcpy",
            (false, true, _) => "avx2_memcpy",
            (false, false, true) => "neon_memcpy",
            (false, false, false) => "scalar_memcpy",
        }
    }
    
    /// Name the preferred Base64 implementation for the stored flags.
    ///
    /// Priority: AVX2, then SSE4.2, then NEON, then `"scalar"`.
    pub fn optimal_base64_variant(&self) -> &'static str {
        match (self.has_avx2, self.has_sse42, self.has_neon) {
            (true, _, _) => "avx2",
            (false, true, _) => "sse42",
            (false, false, true) => "neon",
            (false, false, false) => "scalar",
        }
    }
    
    /// Recommended chunk size in bytes for bulk operations.
    ///
    /// The size follows `optimization_tier`: 64 KiB at tier 5, 32 KiB at tier 4,
    /// 16 KiB at tier 3, 8 KiB at tier 2 and 4 KiB for every other value.
    pub fn recommended_chunk_size(&self) -> usize {
        let kib: usize = match self.optimization_tier {
            5 => 64, // AVX-512
            4 => 32, // AVX2 + BMI2
            3 => 16, // BMI2
            2 => 8,  // Basic SIMD
            _ => 4,  // Scalar
        };
        kib * 1024
    }
    
    /// Whether callers should issue prefetches.
    ///
    /// True when `optimization_tier` is 3 or higher, or when PREFETCHW is flagged.
    pub fn should_use_prefetch(&self) -> bool {
        let high_tier = self.optimization_tier >= 3;
        high_tier || self.has_prefetchw
    }
    
    /// Whether the platform is expected to have favourable memory access behaviour.
    ///
    /// True only when the crate is compiled for x86_64 or aarch64 and the
    /// recorded cache line size is exactly 64 bytes.
    pub fn has_optimal_memory_access(&self) -> bool {
        let supported_arch = cfg!(any(target_arch = "x86_64", target_arch = "aarch64"));
        supported_arch && self.cache_line_size == 64
    }
    
    /// Get recommended memory alignment (in bytes) for SIMD operations.
    ///
    /// * 64 - AVX-512 Foundation (512-bit vectors)
    /// * 32 - AVX2 (256-bit vectors)
    /// * 16 - any 128-bit SIMD: NEON, or SSE4.1/SSE4.2 on x86_64
    /// * 8  - scalar fallback
    ///
    /// The SSE4.x case keeps this in line with `CpuFeatureSet::recommended_alignment`,
    /// which already recommends 16 bytes for SSE-class hardware.
    pub fn recommended_alignment(&self) -> usize {
        // 128-bit vector units: NEON on ARM, SSE4.x on x86_64.
        let has_128bit_simd = self.has_neon || self.has_sse41 || self.has_sse42;

        if self.has_avx512f {
            64  // 512-bit alignment
        } else if self.has_avx2 {
            32  // 256-bit alignment
        } else if has_128bit_simd {
            16  // 128-bit alignment
        } else {
            8   // 64-bit alignment
        }
    }

    /// Check if a specific CPU feature is available
    pub fn has_feature(&self, feature: CpuFeature) -> bool {
        match feature {
            // x86_64 Basic SSE/AVX features
            CpuFeature::SSE2 => true, // Always available on x86_64
            CpuFeature::SSE3 => true, // Commonly available
            CpuFeature::SSSE3 => true, // Commonly available
            CpuFeature::SSE4_1 => self.has_sse41,
            CpuFeature::SSE4_2 => self.has_sse42,
            CpuFeature::AVX => self.has_avx,
            CpuFeature::AVX2 => self.has_avx2,
            
            // x86_64 BMI and specialized instructions
            CpuFeature::BMI1 => self.has_bmi1,
            CpuFeature::BMI2 => self.has_bmi2,
            CpuFeature::POPCNT => self.has_popcnt,
            CpuFeature::LZCNT => self.has_lzcnt,
            CpuFeature::TZCNT => self.has_tzcnt,
            CpuFeature::PREFETCHW => self.has_prefetchw,
            
            // x86_64 AVX-512 feature family
            CpuFeature::AVX512F => self.has_avx512f,
            CpuFeature::AVX512VL => self.has_avx512vl,
            CpuFeature::AVX512BW => self.has_avx512bw,
            CpuFeature::AVX512VPOPCNTDQ => self.has_avx512vpopcntdq,
            
            // ARM64 features
            CpuFeature::NEON => self.has_neon,
            CpuFeature::CRC32 => self.has_crc32,
            CpuFeature::Crypto => self.has_crypto,
            CpuFeature::SVE => self.has_sve,
            CpuFeature::SVE2 => self.has_sve2,
            
            // Universal features
            CpuFeature::UnalignedAccess => {
                #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
                {
                    true // x86_64 and aarch64 support unaligned access
                }
                #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
                {
                    false
                }
            }
            
            // Features not currently tracked in CpuFeatures struct
            _ => false,
        }
    }
}

impl Default for CpuFeatures {
    fn default() -> Self {
        Self::new()
    }
}

impl CpuFeatureSet {
    /// Report whether `feature` is recorded as available.
    ///
    /// A feature that is absent from the map counts as unavailable.
    pub fn has_feature(&self, feature: CpuFeature) -> bool {
        match self.features.get(&feature) {
            Some(&enabled) => enabled,
            None => false,
        }
    }

    /// Get the optimal SIMD instruction set for rank/select operations
    pub fn optimal_rank_select_variant(&self) -> &'static str {
        if self.has_feature(CpuFeature::AVX512F) && self.has_feature(CpuFeature::AVX512BW) {
            "avx512"
        } else if self.has_feature(CpuFeature::BMI2) && self.has_feature(CpuFeature::AVX2) {
            "bmi2_avx2"
        } else if self.has_feature(CpuFeature::AVX2) {
            "avx2"
        } else if self.has_feature(CpuFeature::POPCNT) {
            "popcnt"
        } else if self.has_feature(CpuFeature::NEON) {
            "neon"
        } else {
            "scalar"
        }
    }

    /// Name the preferred Base64 implementation for this feature set.
    ///
    /// Priority: AVX2, then SSE4.2, then NEON, then `"scalar"`.
    pub fn optimal_base64_variant(&self) -> &'static str {
        if self.has_feature(CpuFeature::AVX2) {
            return "avx2";
        }
        if self.has_feature(CpuFeature::SSE4_2) {
            return "sse42";
        }
        if self.has_feature(CpuFeature::NEON) {
            return "neon";
        }
        "scalar"
    }

    /// Get the SIMD optimization tier
    pub fn get_simd_tier(&self) -> u8 {
        if self.has_feature(CpuFeature::AVX512F) {
            4  // Cutting-edge: AVX-512
        } else if self.has_feature(CpuFeature::BMI2) && self.has_feature(CpuFeature::AVX2) {
            3  // Advanced: BMI2 + AVX2
        } else if self.has_feature(CpuFeature::AVX2) {
            2  // Intermediate: AVX2
        } else if self.has_feature(CpuFeature::POPCNT) || self.has_feature(CpuFeature::NEON) {
            1  // Basic: POPCNT or NEON
        } else {
            0  // Scalar fallback
        }
    }

    /// Whether the platform is expected to have favourable memory access behaviour.
    ///
    /// Requires a 64-byte cache line and either a build for x86_64 or the
    /// `UnalignedAccess` feature being recorded.
    pub fn has_optimal_memory_access(&self) -> bool {
        if self.cache_line_size != 64 {
            return false;
        }
        cfg!(target_arch = "x86_64") || self.has_feature(CpuFeature::UnalignedAccess)
    }

    /// Get recommended buffer alignment for SIMD operations
    pub fn recommended_alignment(&self) -> usize {
        if self.has_feature(CpuFeature::AVX512F) {
            64  // 512-bit alignment
        } else if self.has_feature(CpuFeature::AVX2) {
            32  // 256-bit alignment
        } else if self.has_feature(CpuFeature::SSE2) || self.has_feature(CpuFeature::NEON) {
            16  // 128-bit alignment
        } else {
            8   // 64-bit alignment
        }
    }
}

/// Runtime CPU feature detection interface.
///
/// A stateless detector: it carries no data, so it is free to copy and every
/// instance behaves the same. `Default` is derived so that
/// `RuntimeCpuFeatures::default()` is available alongside `RuntimeCpuFeatures::new()`.
#[derive(Debug, Default, Clone, Copy)]
pub struct RuntimeCpuFeatures;

impl RuntimeCpuFeatures {
    /// Create a new runtime feature detector
    pub fn new() -> Self {
        Self
    }

    /// Detect all available CPU features
    pub fn detect_features(&self) -> CpuFeatures {
        let mut features = CpuFeatures::new();
        
        // Detect CPU info
        let (vendor, model) = self.get_cpu_info();
        features.vendor = vendor;
        features.model = model;
        
        // Detect core counts
        let (logical_cores, physical_cores) = self.get_core_count();
        features.logical_cores = logical_cores;
        features.physical_cores = physical_cores;
        
        // Detect cache info with enhanced detection
        let cache_info = self.get_enhanced_cache_info();
        features.cache_line_size = cache_info.0;
        features.l1_cache_size = cache_info.1;
        features.l2_cache_size = cache_info.2;
        features.l3_cache_size = cache_info.3;
        
        // Platform-specific feature detection
        #[cfg(target_arch = "x86_64")]
        {
            self.detect_x86_features(&mut features);
        }
        
        #[cfg(target_arch = "aarch64")]
        {
            self.detect_arm_features(&mut features);
        }
        
        // Configure SIMD optimization strategy
        features.detect_and_configure_simd();
        
        features
    }
    
    /// Enhanced x86_64 feature detection for CpuFeatures.
    ///
    /// Reads the basic, extended and extended-identifier CPUID leaves through
    /// `raw_cpuid` and stores the results in `features`. Flags whose CPUID leaf
    /// is unavailable are left as they were.
    ///
    /// AVX-512 VPOPCNTDQ is queried through the standard library's
    /// `is_x86_feature_detected!` macro. It used to be hard-coded to `false`,
    /// which made optimization tier 5 and the `"avx512_popcnt"` variant
    /// unreachable on every machine.
    #[cfg(target_arch = "x86_64")]
    fn detect_x86_features(&self, features: &mut CpuFeatures) {
        let cpuid = raw_cpuid::CpuId::new();

        // Basic features
        if let Some(feature_info) = cpuid.get_feature_info() {
            features.has_sse41 = feature_info.has_sse41();
            features.has_sse42 = feature_info.has_sse42();
            features.has_avx = feature_info.has_avx();
            features.has_popcnt = feature_info.has_popcnt();
        }

        // Extended features
        if let Some(extended_features) = cpuid.get_extended_feature_info() {
            features.has_avx2 = extended_features.has_avx2();
            features.has_bmi1 = extended_features.has_bmi1();
            features.has_bmi2 = extended_features.has_bmi2();

            // AVX-512 features
            features.has_avx512f = extended_features.has_avx512f();
            features.has_avx512vl = extended_features.has_avx512vl();
            features.has_avx512bw = extended_features.has_avx512bw();
        }

        // Runtime query via std; needs no particular raw_cpuid version.
        features.has_avx512vpopcntdq =
            std::arch::is_x86_feature_detected!("avx512vpopcntdq");

        // NOTE(review): PREFETCHW is still not detected and stays disabled, as
        // before. Wire it up once the CPUID accessor for it is confirmed.
        features.has_prefetchw = false;

        // Extended processor info
        if let Some(extended_info) = cpuid.get_extended_processor_and_feature_identifiers() {
            features.has_lzcnt = extended_info.has_lzcnt();
        }

        // TZCNT is typically available with BMI1. This depends only on the BMI1
        // flag, so it must not be skipped when the extended-identifier leaf is
        // missing.
        features.has_tzcnt = features.has_bmi1;
    }
    
    /// Enhanced ARM64 feature detection for CpuFeatures.
    ///
    /// NEON is assumed present. CRC32, crypto, SVE and SVE2 are looked up in
    /// `/proc/cpuinfo` when that file is readable. On Linux the AT_HWCAP value
    /// from `get_auxval_features` is then merged in.
    ///
    /// The auxiliary-vector bits are OR-ed into the flags rather than assigned.
    /// An auxval source that reports nothing (for example a stub returning zero)
    /// must not erase features that `/proc/cpuinfo` already confirmed.
    #[cfg(target_arch = "aarch64")]
    fn detect_arm_features(&self, features: &mut CpuFeatures) {
        // Most AArch64 systems have NEON
        features.has_neon = true;

        // Try to detect additional features through /proc/cpuinfo
        if let Ok(cpuinfo) = std::fs::read_to_string("/proc/cpuinfo") {
            let cpuinfo_lower = cpuinfo.to_lowercase();
            features.has_crc32 = cpuinfo_lower.contains("crc32");
            features.has_crypto = cpuinfo_lower.contains("aes") || cpuinfo_lower.contains("crypto");
            features.has_sve = cpuinfo_lower.contains("sve");
            features.has_sve2 = cpuinfo_lower.contains("sve2");
        }

        // Try runtime feature detection where available
        #[cfg(target_os = "linux")]
        {
            // AT_HWCAP bit positions from the Linux arm64 hwcap ABI.
            // AES is bit 3; bit 4 is PMULL.
            const HWCAP_AES: u64 = 1 << 3;
            const HWCAP_CRC32: u64 = 1 << 7;
            const HWCAP_SVE: u64 = 1 << 22;

            if let Ok(auxval) = self.get_auxval_features() {
                features.has_crc32 |= (auxval & HWCAP_CRC32) != 0;
                features.has_crypto |= (auxval & HWCAP_AES) != 0;
                features.has_sve |= (auxval & HWCAP_SVE) != 0;
            }
        }
    }
    
    /// Enhanced cache detection with more accurate sizing.
    ///
    /// Returns `(cache_line_size, l1_size, l2_size, l3_size)` in bytes. Every
    /// value starts from a default (64 B line, 32 KiB L1, 256 KiB L2, 8 MiB L3)
    /// and is replaced only when the platform reports something usable:
    ///
    /// * x86_64: the deterministic cache parameters from CPUID, where
    ///   size = ways x partitions x line size x sets.
    /// * aarch64: the `/sys/devices/system/cpu/cpu0/cache/*` entries
    ///   (`level`, `type`, `size`, `coherency_line_size`).
    ///
    /// Instruction caches are skipped, so level 1 reports the data (or unified)
    /// cache.
    fn get_enhanced_cache_info(&self) -> (usize, usize, usize, usize) {
        let mut cache_line_size = 64; // Default assumption
        let mut l1_size = 32 * 1024; // 32KB default
        let mut l2_size = 256 * 1024; // 256KB default
        let mut l3_size = 8 * 1024 * 1024; // 8MB default

        #[cfg(target_arch = "x86_64")]
        {
            let cpuid = raw_cpuid::CpuId::new();

            if let Some(cache_params) = cpuid.get_cache_parameters() {
                for cache in cache_params {
                    // The L1 instruction cache must not shadow the L1 data cache.
                    if matches!(cache.cache_type(), raw_cpuid::CacheType::Instruction) {
                        continue;
                    }

                    let line_size = cache.coherency_line_size() as usize;
                    let ways = cache.associativity() as usize;
                    let partitions = cache.physical_line_partitions() as usize;
                    let sets = cache.sets() as usize;

                    if line_size > 0 {
                        cache_line_size = line_size;
                    }

                    let cache_size = ways * partitions * line_size * sets;
                    if cache_size == 0 {
                        // Nothing usable reported; keep the default for this level.
                        continue;
                    }

                    match cache.level() {
                        1 => l1_size = cache_size,
                        2 => l2_size = cache_size,
                        3 => l3_size = cache_size,
                        _ => {}
                    }
                }
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            // Try to get cache info from /sys/devices/system/cpu/
            if let Ok(entries) = std::fs::read_dir("/sys/devices/system/cpu/cpu0/cache") {
                for entry in entries.flatten() {
                    let dir = entry.path();
                    // Entries that are not cache index directories simply yield `None`.
                    let read_attr = |name: &str| std::fs::read_to_string(dir.join(name)).ok();

                    let is_instruction_cache = read_attr("type")
                        .map_or(false, |kind| kind.trim().eq_ignore_ascii_case("Instruction"));
                    if is_instruction_cache {
                        continue;
                    }

                    let line_size = read_attr("coherency_line_size")
                        .and_then(|text| text.trim().parse::<usize>().ok());
                    if let Some(line_size) = line_size {
                        if line_size > 0 {
                            cache_line_size = line_size;
                        }
                    }

                    let level = read_attr("level").and_then(|text| text.trim().parse::<u8>().ok());
                    let size = read_attr("size")
                        .and_then(|text| self.parse_cache_size(&text).ok())
                        .filter(|&bytes| bytes > 0);

                    if let (Some(level), Some(size)) = (level, size) {
                        match level {
                            1 => l1_size = size,
                            2 => l2_size = size,
                            3 => l3_size = size,
                            _ => {}
                        }
                    }
                }
            }
        }

        (cache_line_size, l1_size, l2_size, l3_size)
    }
    
    /// Parse cache size string (e.g., "32K", "1M") to bytes.
    ///
    /// Surrounding whitespace is ignored and the unit suffix is
    /// case-insensitive. Supported suffixes are `K` (x1024), `M` (x1024^2) and
    /// `G` (x1024^3); a bare number is taken as bytes.
    ///
    /// Exactly one suffix character is removed, so malformed input such as
    /// `"32KK"` is rejected instead of being read as 32 KiB.
    ///
    /// # Errors
    ///
    /// Returns the `ParseIntError` from parsing the numeric part when it is
    /// empty or not a valid unsigned integer. A product that would overflow
    /// `usize` saturates at `usize::MAX` instead of wrapping.
    fn parse_cache_size(&self, size_str: &str) -> Result<usize, std::num::ParseIntError> {
        let trimmed = size_str.trim().to_uppercase();

        let (digits, multiplier): (&str, usize) = if let Some(rest) = trimmed.strip_suffix('K') {
            (rest, 1024)
        } else if let Some(rest) = trimmed.strip_suffix('M') {
            (rest, 1024 * 1024)
        } else if let Some(rest) = trimmed.strip_suffix('G') {
            (rest, 1024 * 1024 * 1024)
        } else {
            (trimmed.as_str(), 1)
        };

        digits
            .trim()
            .parse::<usize>()
            .map(|count| count.saturating_mul(multiplier))
    }
    
    /// Get auxiliary vector features on Linux ARM64.
    ///
    /// This is still a placeholder: a real implementation would return the
    /// value of `getauxval(AT_HWCAP)`.
    ///
    /// # Errors
    ///
    /// Always returns an `Unsupported` I/O error for now. `Ok(0)` would be read
    /// by callers as "the kernel reported no optional features", which is a
    /// different statement from "we could not ask".
    #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
    fn get_auxval_features(&self) -> Result<u64, std::io::Error> {
        Err(std::io::Error::new(
            std::io::ErrorKind::Unsupported,
            "AT_HWCAP lookup via getauxval is not implemented",
        ))
    }
    

    /// Get CPU vendor and model information as `(vendor, model)`.
    ///
    /// * x86_64: vendor and brand string from CPUID, each falling back to
    ///   `"Unknown"` when the leaf is unavailable.
    /// * aarch64: parsed from `/proc/cpuinfo` ("CPU implementer" and
    ///   "model name" lines). When the file cannot be read, or a line is
    ///   missing, the defaults `("ARM", "Unknown")` are returned.
    /// * any other architecture: `("Unknown", "Unknown")`.
    ///
    /// Every configuration ends in a tuple expression. The aarch64 branch
    /// previously ended in an `if let` without `else`, which left no value for
    /// the case where `/proc/cpuinfo` is unreadable.
    fn get_cpu_info(&self) -> (String, String) {
        #[cfg(target_arch = "x86_64")]
        {
            let cpuid = raw_cpuid::CpuId::new();
            let vendor = cpuid.get_vendor_info()
                .map(|v| v.as_str().to_string())
                .unwrap_or_else(|| "Unknown".to_string());
            let model = cpuid.get_processor_brand_string()
                .map(|b| b.as_str().to_string())
                .unwrap_or_else(|| "Unknown".to_string());
            (vendor, model)
        }

        #[cfg(target_arch = "aarch64")]
        {
            let mut vendor = "ARM".to_string();
            let mut model = "Unknown".to_string();

            // Try to get ARM CPU info from /proc/cpuinfo; keep the defaults otherwise.
            if let Ok(cpuinfo) = std::fs::read_to_string("/proc/cpuinfo") {
                for line in cpuinfo.lines() {
                    if line.starts_with("CPU implementer") {
                        if line.contains("0x41") {
                            vendor = "ARM".to_string();
                        } else if line.contains("0x51") {
                            vendor = "Qualcomm".to_string();
                        }
                    } else if line.starts_with("model name") {
                        if let Some(name) = line.split(':').nth(1) {
                            model = name.trim().to_string();
                        }
                    }
                }
            }

            (vendor, model)
        }

        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            ("Unknown".to_string(), "Unknown".to_string())
        }
    }

    /// Get logical and physical core counts as `(logical, physical)`.
    ///
    /// The logical count comes from `std::thread::available_parallelism`
    /// (falling back to 1). The physical count is an approximation: on x86_64,
    /// when CPUID reports the HTT flag, it is taken to be half the logical
    /// count; otherwise it equals the logical count.
    ///
    /// The physical count is never reported as zero. With one logical core
    /// available and HTT set, the halving would otherwise round down to 0.
    fn get_core_count(&self) -> (usize, usize) {
        let logical_cores = std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(1);

        #[cfg(target_arch = "x86_64")]
        {
            let cpuid = raw_cpuid::CpuId::new();
            let hyperthreading = cpuid
                .get_feature_info()
                .map_or(false, |feature_info| feature_info.has_htt());
            if hyperthreading {
                // Two hardware threads per core is assumed; clamp so a single
                // visible logical core still counts as one physical core.
                return (logical_cores, (logical_cores / 2).max(1));
            }
        }

        // Default assumption: one hardware thread per core.
        (logical_cores, logical_cores)
    }

    /// Basic cache information as `(cache_line_size, l1_size, l2_size, l3_size)`.
    ///
    /// Only the cache line size is detected, and only on x86_64, where it is
    /// taken from the first cache reported by CPUID. The three cache sizes are
    /// always the fixed defaults (32 KiB, 256 KiB, 8 MiB).
    fn get_cache_info(&self) -> (usize, usize, usize, usize) {
        const DEFAULT_L1: usize = 32 * 1024; // 32KB
        const DEFAULT_L2: usize = 256 * 1024; // 256KB
        const DEFAULT_L3: usize = 8 * 1024 * 1024; // 8MB

        let mut line_size = 64; // Default assumption

        #[cfg(target_arch = "x86_64")]
        {
            let cpuid = raw_cpuid::CpuId::new();
            let first_cache = cpuid
                .get_cache_parameters()
                .and_then(|mut params| params.next());

            if let Some(cache) = first_cache {
                line_size = cache.coherency_line_size() as usize;
            }
        }

        (line_size, DEFAULT_L1, DEFAULT_L2, DEFAULT_L3)
    }
}

// Process-wide cache of the detection result
static CPU_FEATURES: OnceLock<CpuFeatures> = OnceLock::new();

/// Return the process-wide CPU features, running detection on first use.
///
/// The result is stored in a `OnceLock`, so later calls hand back the same
/// `&'static CpuFeatures` without probing again. This is the main entry point
/// for feature queries (SIMD implementation plan, Phase 1.1).
pub fn get_cpu_features() -> &'static CpuFeatures {
    fn run_detection() -> CpuFeatures {
        let detector = RuntimeCpuFeatures::new();
        detector.detect_features()
    }

    CPU_FEATURES.get_or_init(run_detection)
}

/// Query the globally detected feature set for a single `feature`.
pub fn has_cpu_feature(feature: CpuFeature) -> bool {
    let detected = get_cpu_features();
    detected.has_feature(feature)
}

/// Run (or reuse) hardware detection and return the configured feature set.
///
/// This is an alias for [`get_cpu_features`], named after Phase 1.1 of the SIMD
/// plan; the tiers are already computed in the value it returns.
pub fn detect_and_configure_simd() -> &'static CpuFeatures {
    let configured: &'static CpuFeatures = get_cpu_features();
    configured
}

/// Summarise the best implementation choices for the current hardware.
///
/// Every field of the returned `SimdStrategy` is taken from the globally
/// detected `CpuFeatures`: the variant names, chunk size, alignment and
/// prefetch advice come from its helper methods, the two tiers from its fields.
pub fn get_optimal_simd_strategy() -> SimdStrategy {
    let cpu = get_cpu_features();

    // Implementation variants
    let rank_select_variant = cpu.optimal_rank_select_variant();
    let string_search_variant = cpu.optimal_string_search_variant();
    let memcpy_variant = cpu.optimal_memcpy_variant();

    // Tuning parameters
    let chunk_size = cpu.recommended_chunk_size();
    let alignment = cpu.recommended_alignment();
    let use_prefetch = cpu.should_use_prefetch();

    SimdStrategy {
        rank_select_variant,
        string_search_variant,
        memcpy_variant,
        chunk_size,
        alignment,
        use_prefetch,
        optimization_tier: cpu.optimization_tier,
        simd_tier: cpu.simd_tier,
    }
}

/// SIMD optimization strategy result
///
/// A plain value object produced by `get_optimal_simd_strategy`, bundling the
/// implementation choices derived from the detected CPU features. It derives
/// `PartialEq`/`Eq` so two strategies can be compared directly instead of
/// field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SimdStrategy {
    /// Name of the rank/select implementation variant
    /// (from `CpuFeatures::optimal_rank_select_variant`).
    pub rank_select_variant: &'static str,
    /// Name of the string-search implementation variant
    /// (from `CpuFeatures::optimal_string_search_variant`).
    pub string_search_variant: &'static str,
    /// Name of the memory-copy implementation variant
    /// (from `CpuFeatures::optimal_memcpy_variant`).
    pub memcpy_variant: &'static str,
    /// Recommended processing chunk size, in bytes
    /// (from `CpuFeatures::recommended_chunk_size`).
    pub chunk_size: usize,
    /// Recommended buffer alignment, in bytes
    /// (from `CpuFeatures::recommended_alignment`).
    pub alignment: usize,
    /// Whether prefetching is advised (from `CpuFeatures::should_use_prefetch`).
    pub use_prefetch: bool,
    /// Overall optimization tier copied from the detected features; the unit
    /// tests in this file expect a value of at most 5.
    pub optimization_tier: u8,
    /// SIMD tier copied from the detected features; the unit tests in this
    /// file expect a value of at most 4.
    pub simd_tier: u8,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Sanity-checks the basic fields of the globally detected features.
    #[test]
    fn test_cpu_feature_detection() {
        let features = get_cpu_features();
        
        // Basic sanity checks
        assert!(features.logical_cores > 0);
        assert!(features.physical_cores > 0);
        assert!(features.cache_line_size > 0);
        assert!(!features.vendor.is_empty());
        
        // SIMD tier should be reasonable
        assert!(features.simd_tier <= 4);
        
        println!("CPU: {} {}", features.vendor, features.model);
        println!("Cores: {} logical, {} physical", features.logical_cores, features.physical_cores);
        println!("Cache line size: {} bytes", features.cache_line_size);
        println!("SIMD tier: {}", features.simd_tier);
        println!("Optimal rank/select: {}", features.optimal_rank_select_variant());
        println!("Optimal base64: {}", features.optimal_base64_variant());
    }
    
    /// Checks the tier bounds and prints the full feature report
    /// (visible with `cargo test -- --nocapture`).
    #[test]
    fn test_advanced_cpu_feature_detection() {
        let advanced_features = get_cpu_features();
        
        // Basic sanity checks
        assert!(advanced_features.logical_cores > 0);
        assert!(advanced_features.physical_cores > 0);
        assert!(advanced_features.cache_line_size > 0);
        assert!(!advanced_features.vendor.is_empty());
        
        // Optimization tiers should be reasonable
        assert!(advanced_features.optimization_tier <= 5);
        assert!(advanced_features.simd_tier <= 4);
        
        println!("=== Advanced CPU Features ===");
        println!("CPU: {} {}", advanced_features.vendor, advanced_features.model);
        println!("Cores: {} logical, {} physical", advanced_features.logical_cores, advanced_features.physical_cores);
        println!("Cache: L1={}, L2={}, L3={}, Line={}",
                 advanced_features.l1_cache_size,
                 advanced_features.l2_cache_size,
                 advanced_features.l3_cache_size,
                 advanced_features.cache_line_size);
        
        println!("x86_64 Features:");
        println!("  SSE4.1: {}, SSE4.2: {}", advanced_features.has_sse41, advanced_features.has_sse42);
        println!("  AVX: {}, AVX2: {}", advanced_features.has_avx, advanced_features.has_avx2);
        println!("  AVX-512F: {}, AVX-512VL: {}, AVX-512BW: {}", 
                 advanced_features.has_avx512f, advanced_features.has_avx512vl, advanced_features.has_avx512bw);
        println!("  BMI1: {}, BMI2: {}", advanced_features.has_bmi1, advanced_features.has_bmi2);
        println!("  POPCNT: {}, LZCNT: {}, TZCNT: {}", 
                 advanced_features.has_popcnt, advanced_features.has_lzcnt, advanced_features.has_tzcnt);
        
        println!("ARM64 Features:");
        println!("  NEON: {}, CRC32: {}", advanced_features.has_neon, advanced_features.has_crc32);
        println!("  Crypto: {}, SVE: {}, SVE2: {}", 
                 advanced_features.has_crypto, advanced_features.has_sve, advanced_features.has_sve2);
        
        println!("Optimization: Tier={}, SIMD={}", 
                 advanced_features.optimization_tier, advanced_features.simd_tier);
        println!("Optimal rank/select: {}", advanced_features.optimal_rank_select_variant());
        println!("Optimal string search: {}", advanced_features.optimal_string_search_variant());
        println!("Optimal memcpy: {}", advanced_features.optimal_memcpy_variant());
        println!("Recommended chunk size: {} bytes", advanced_features.recommended_chunk_size());
        println!("Recommended alignment: {} bytes", advanced_features.recommended_alignment());
        println!("Use prefetch: {}", advanced_features.should_use_prefetch());
    }
    
    /// Validates the value ranges of the computed strategy and checks that
    /// its tier fields mirror the cached CPU features.
    #[test]
    fn test_simd_strategy() {
        let strategy = get_optimal_simd_strategy();
        
        // Strategy should have valid values
        assert!(!strategy.rank_select_variant.is_empty());
        assert!(!strategy.string_search_variant.is_empty());
        assert!(!strategy.memcpy_variant.is_empty());
        assert!(strategy.chunk_size >= 4096);
        assert!(strategy.chunk_size <= 65536);
        assert!(strategy.alignment >= 8);
        assert!(strategy.alignment <= 64);
        assert!(strategy.alignment.is_power_of_two());
        assert!(strategy.optimization_tier <= 5);
        assert!(strategy.simd_tier <= 4);

        // The tiers are copied straight from the cached features, so they
        // must match exactly.
        let features = get_cpu_features();
        assert_eq!(strategy.optimization_tier, features.optimization_tier);
        assert_eq!(strategy.simd_tier, features.simd_tier);
        
        println!("=== SIMD Strategy ===");
        println!("Rank/Select: {}", strategy.rank_select_variant);
        println!("String Search: {}", strategy.string_search_variant);
        println!("Memory Copy: {}", strategy.memcpy_variant);
        println!("Chunk Size: {} bytes", strategy.chunk_size);
        println!("Alignment: {} bytes", strategy.alignment);
        println!("Use Prefetch: {}", strategy.use_prefetch);
        println!("Optimization Tier: {}", strategy.optimization_tier);
        println!("SIMD Tier: {}", strategy.simd_tier);
    }
    
    /// `detect_and_configure_simd` must hand back the very same cached
    /// instance as `get_cpu_features`.
    #[test]
    fn test_detect_and_configure_simd() {
        let features = detect_and_configure_simd();
        
        // Should return the same instance as get_cpu_features: compare the
        // addresses, not just a couple of fields.
        let global_features = get_cpu_features();
        assert!(std::ptr::eq(features, global_features));
        
        // Optimization tier should be calculated
        assert!(features.optimization_tier <= 5);
        assert!(features.simd_tier <= 4);
        
        println!("SIMD Configuration Complete:");
        println!("  Optimization Tier: {}", features.optimization_tier);
        println!("  SIMD Tier: {}", features.simd_tier);
    }

    /// The convenience wrapper must not panic and must agree with a direct
    /// query on the cached features.
    #[test]
    fn test_has_cpu_feature() {
        let features = get_cpu_features();

        assert_eq!(
            has_cpu_feature(CpuFeature::POPCNT),
            features.has_feature(CpuFeature::POPCNT)
        );
        assert_eq!(
            has_cpu_feature(CpuFeature::AVX2),
            features.has_feature(CpuFeature::AVX2)
        );
    }

    /// Checks that variant selection returns one of the known names and that
    /// the recommended alignment is a sane power of two.
    #[test]
    fn test_feature_set_methods() {
        let features = get_cpu_features();
        
        // Test optimization variant selection
        let rank_select_variant = features.optimal_rank_select_variant();
        assert!(["scalar", "popcnt", "avx2", "bmi2_avx2", "avx512", "neon"].contains(&rank_select_variant));
        
        let base64_variant = features.optimal_base64_variant();
        assert!(["scalar", "sse42", "avx2", "neon"].contains(&base64_variant));
        
        // Test alignment recommendation
        let alignment = features.recommended_alignment();
        assert!((8..=64).contains(&alignment));
        assert!(alignment.is_power_of_two());
    }

    /// Exercises the memory-access query and bounds the detected cache line size.
    #[test]
    fn test_memory_access_patterns() {
        let features = get_cpu_features();
        
        // Test memory access optimization detection (only checks it does not panic)
        let _has_optimal = features.has_optimal_memory_access();
        
        // Cache line size should be reasonable (typically 64 bytes)
        assert!((32..=128).contains(&features.cache_line_size));
    }
}