Skip to main content

scirs2_core/
performance_optimization.rs

1//! Performance optimization utilities for critical paths
2//!
3//! This module provides tools and utilities for optimizing performance-critical
4//! sections of scirs2-core based on profiling data. Enhanced with AI-driven
5//! adaptive optimization and ML-based performance modeling for Advanced mode.
6//!
7//! # Advanced Mode Features
8//!
9//! - **AI-Driven Strategy Selection**: Machine learning models predict optimal strategies
10//! - **Neural Performance Modeling**: Deep learning for performance prediction
11//! - **Adaptive Hyperparameter Tuning**: Automatic optimization parameter adjustment
12//! - **Real-time Performance Learning**: Continuous improvement from execution data
13//! - **Multi-objective optimization**: Balance performance, memory, and energy efficiency
14//! - **Context-Aware Optimization**: Environment and workload-specific adaptations
15
16use std::sync::atomic::{AtomicUsize, Ordering};
17
/// Temporal-locality hint accepted by [`PerformanceHints::prefetch_with_locality`].
///
/// Each variant selects a different prefetch instruction on x86_64
/// (`prefetcht0` / `prefetcht1` / `prefetcht2` / `prefetchnta`) and on aarch64
/// (`pldl1keep` / `pldl2keep` / `pldl3keep` / `pldl1strm`).
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Locality {
    /// Data is expected to be reused soon (targets the L1 cache).
    High,
    /// Data may be reused (targets the L2 cache).
    Medium,
    /// Data is unlikely to be reused soon (targets the L3 cache).
    Low,
    /// Streaming access with no temporal locality (non-temporal prefetch).
    None,
}
31
/// Namespace type for low-level performance hints used on critical code paths.
///
/// The type carries no data; every operation is exposed as an associated
/// function (for example `PerformanceHints::likely`).
pub struct PerformanceHints;
34
35impl PerformanceHints {
36    /// Hint that a branch is likely to be taken
37    ///
38    /// Note: This function provides branch prediction hints on supported architectures.
39    /// For Beta 1 stability, unstable intrinsics have been removed.
40    #[inline(always)]
41    pub fn likely(cond: bool) -> bool {
42        // Use platform-specific assembly hints where available
43        #[cfg(target_arch = "x86_64")]
44        {
45            if cond {
46                // x86_64 specific: use assembly hint for branch prediction
47                unsafe {
48                    std::arch::asm!("# likely branch", options(nomem, nostack));
49                }
50            }
51        }
52        cond
53    }
54
55    /// Hint that a branch is unlikely to be taken
56    ///
57    /// Note: This function provides branch prediction hints on supported architectures.
58    /// For Beta 1 stability, unstable intrinsics have been removed.
59    #[inline(always)]
60    pub fn unlikely(cond: bool) -> bool {
61        // Use platform-specific assembly hints where available
62        #[cfg(target_arch = "x86_64")]
63        {
64            if !cond {
65                // x86_64 specific: use assembly hint for branch prediction
66                unsafe {
67                    std::arch::asm!("# unlikely branch", options(nomem, nostack));
68                }
69            }
70        }
71        cond
72    }
73
74    /// Prefetch data for read access
75    #[inline(always)]
76    pub fn prefetch_read<T>(data: &T) {
77        let ptr = data as *const T as *const u8;
78
79        #[cfg(target_arch = "x86_64")]
80        {
81            unsafe {
82                // Prefetch into all cache levels for read
83                std::arch::asm!(
84                    "prefetcht0 [{}]",
85                    in(reg) ptr,
86                    options(readonly, nostack)
87                );
88            }
89        }
90        #[cfg(target_arch = "aarch64")]
91        {
92            unsafe {
93                // ARMv8 prefetch for load
94                std::arch::asm!(
95                    "prfm pldl1keep, [{}]",
96                    in(reg) ptr,
97                    options(readonly, nostack)
98                );
99            }
100        }
101        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
102        {
103            // Fallback: use black_box to prevent optimization but don't prefetch
104            std::hint::black_box(data);
105        }
106    }
107
108    /// Prefetch data for write access
109    #[inline(always)]
110    pub fn prefetch_write<T>(data: &mut T) {
111        let ptr = data as *mut T as *mut u8;
112
113        #[cfg(target_arch = "x86_64")]
114        {
115            unsafe {
116                // Prefetch with intent to write
117                std::arch::asm!(
118                    "prefetcht0 [{}]",
119                    in(reg) ptr,
120                    options(nostack)
121                );
122            }
123        }
124        #[cfg(target_arch = "aarch64")]
125        {
126            unsafe {
127                // ARMv8 prefetch for store
128                std::arch::asm!(
129                    "prfm pstl1keep, [{}]",
130                    in(reg) ptr,
131                    options(nostack)
132                );
133            }
134        }
135        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
136        {
137            // Fallback: use black_box to prevent optimization but don't prefetch
138            std::hint::black_box(data);
139        }
140    }
141
142    /// Advanced prefetch with locality hint
143    #[inline(always)]
144    pub fn prefetch_with_locality<T>(data: &T, locality: Locality) {
145        let ptr = data as *const T as *const u8;
146
147        #[cfg(target_arch = "x86_64")]
148        {
149            unsafe {
150                match locality {
151                    Locality::High => {
152                        // Prefetch into L1 cache
153                        std::arch::asm!(
154                            "prefetcht0 [{}]",
155                            in(reg) ptr,
156                            options(readonly, nostack)
157                        );
158                    }
159                    Locality::Medium => {
160                        // Prefetch into L2 cache
161                        std::arch::asm!(
162                            "prefetcht1 [{}]",
163                            in(reg) ptr,
164                            options(readonly, nostack)
165                        );
166                    }
167                    Locality::Low => {
168                        // Prefetch into L3 cache
169                        std::arch::asm!(
170                            "prefetcht2 [{}]",
171                            in(reg) ptr,
172                            options(readonly, nostack)
173                        );
174                    }
175                    Locality::None => {
176                        // Non-temporal prefetch
177                        std::arch::asm!(
178                            "prefetchnta [{}]",
179                            in(reg) ptr,
180                            options(readonly, nostack)
181                        );
182                    }
183                }
184            }
185        }
186        #[cfg(target_arch = "aarch64")]
187        {
188            unsafe {
189                match locality {
190                    Locality::High => {
191                        std::arch::asm!(
192                            "prfm pldl1keep, [{}]",
193                            in(reg) ptr,
194                            options(readonly, nostack)
195                        );
196                    }
197                    Locality::Medium => {
198                        std::arch::asm!(
199                            "prfm pldl2keep, [{}]",
200                            in(reg) ptr,
201                            options(readonly, nostack)
202                        );
203                    }
204                    Locality::Low => {
205                        std::arch::asm!(
206                            "prfm pldl3keep, [{}]",
207                            in(reg) ptr,
208                            options(readonly, nostack)
209                        );
210                    }
211                    Locality::None => {
212                        std::arch::asm!(
213                            "prfm pldl1strm, [{}]",
214                            in(reg) ptr,
215                            options(readonly, nostack)
216                        );
217                    }
218                }
219            }
220        }
221        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
222        {
223            std::hint::black_box(data);
224        }
225    }
226
227    /// Memory fence for synchronization
228    #[inline(always)]
229    pub fn memory_fence() {
230        #[cfg(target_arch = "x86_64")]
231        {
232            unsafe {
233                std::arch::asm!("mfence", options(nostack));
234            }
235        }
236        #[cfg(target_arch = "aarch64")]
237        {
238            unsafe {
239                std::arch::asm!("dmb sy", options(nostack));
240            }
241        }
242        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
243        {
244            std::sync::atomic::fence(std::sync::atomic::Ordering::SeqCst);
245        }
246    }
247
248    /// Cache line flush for explicit cache management
249    #[inline(always)]
250    pub fn flush_cache_line<T>(data: &T) {
251        let ptr = data as *const T as *const u8;
252
253        // Note: Cache line flushing is arch-specific and may not be portable
254        // For now, use a memory barrier as a fallback
255        #[cfg(target_arch = "x86_64")]
256        {
257            // On x86_64, we would use clflush but it requires specific syntax
258            // For simplicity, we'll use a fence instruction instead
259            unsafe {
260                std::arch::asm!("mfence", options(nostack, nomem));
261            }
262        }
263        #[cfg(target_arch = "aarch64")]
264        {
265            unsafe {
266                // ARMv8 data cache clean and invalidate
267                std::arch::asm!(
268                    "dc civac, {}",
269                    in(reg) ptr,
270                    options(nostack)
271                );
272            }
273        }
274        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
275        {
276            // No specific flush available, just prevent optimization
277            std::hint::black_box(data);
278        }
279    }
280
281    /// Optimized memory copy with cache awareness
282    #[inline]
283    pub fn cache_aware_copy<T: Copy>(src: &[T], dst: &mut [T]) {
284        assert_eq!(src.len(), dst.len());
285
286        if std::mem::size_of_val(src) > 64 * 1024 {
287            // Large copy: use non-temporal stores to avoid cache pollution
288            #[cfg(target_arch = "x86_64")]
289            {
290                unsafe {
291                    let src_ptr = src.as_ptr() as *const u8;
292                    let dst_ptr = dst.as_mut_ptr() as *mut u8;
293                    let len = std::mem::size_of_val(src);
294
295                    // Use non-temporal memory copy for large transfers
296                    std::ptr::copy_nonoverlapping(src_ptr, dst_ptr, len);
297
298                    // Follow with memory fence
299                    std::arch::asm!("sfence", options(nostack));
300                }
301                return;
302            }
303        }
304
305        // Regular copy for smaller data or unsupported architectures
306        dst.copy_from_slice(src);
307    }
308
309    /// Optimized memory set with cache awareness
310    #[inline]
311    pub fn cache_aware_memset<T: Copy>(dst: &mut [T], value: T) {
312        if std::mem::size_of_val(dst) > 32 * 1024 {
313            // Large memset: use vectorized operations where possible
314            #[cfg(all(feature = "simd", target_arch = "x86_64"))]
315            {
316                // For large arrays, try to use SIMD if T is appropriate
317                if std::mem::size_of::<T>() == 8 {
318                    // 64-bit values can use SSE2
319                    let chunks = dst.len() / 2;
320                    for i in 0..chunks {
321                        dst[i * 2] = value;
322                        dst[i * 2 + 1] = value;
323                    }
324                    // Handle remainder
325                    for item in dst.iter_mut().skip(chunks * 2) {
326                        *item = value;
327                    }
328                    return;
329                }
330            }
331        }
332
333        // Regular fill for smaller data or unsupported cases
334        dst.fill(value);
335    }
336}
337
338/// Performance metrics for adaptive learning
339#[allow(dead_code)]
340#[derive(Debug, Clone)]
341pub struct PerformanceMetrics {
342    /// Average execution times for different operation types
343    pub operation_times: std::collections::HashMap<String, f64>,
344    /// Success rate for different optimization strategies
345    pub strategy_success_rates: std::collections::HashMap<OptimizationStrategy, f64>,
346    /// Memory bandwidth utilization
347    pub memorybandwidth_utilization: f64,
348    /// Cache hit rates
349    pub cache_hit_rate: f64,
350    /// Parallel efficiency measurements
351    pub parallel_efficiency: f64,
352}
353
354impl Default for PerformanceMetrics {
355    fn default() -> Self {
356        Self {
357            operation_times: std::collections::HashMap::new(),
358            strategy_success_rates: std::collections::HashMap::new(),
359            memorybandwidth_utilization: 0.0,
360            cache_hit_rate: 0.0,
361            parallel_efficiency: 0.0,
362        }
363    }
364}
365
/// The optimization strategies that the selectors in this module can return.
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum OptimizationStrategy {
    /// Scalar execution; the final fallback of threshold-based selection.
    Scalar,
    /// SIMD execution.
    Simd,
    /// Parallel execution.
    Parallel,
    /// GPU execution.
    Gpu,
    /// Hybrid strategy.
    Hybrid,
    /// Cache-optimized strategy.
    CacheOptimized,
    /// Strategy for memory-bound operations.
    MemoryBound,
    /// Strategy for compute-bound operations.
    ComputeBound,
    /// Modern architecture-specific optimizations (Zen4, Golden Cove, Apple Silicon).
    ModernArchOptimized,
    /// Vector-optimized for advanced SIMD (AVX-512, NEON).
    VectorOptimized,
    /// Energy-efficient optimization for mobile/edge devices.
    EnergyEfficient,
    /// High-throughput optimization for server workloads.
    HighThroughput,
}
387
388/// Strategy selector for choosing the best optimization approach
389#[allow(dead_code)]
390#[derive(Debug, Clone)]
391pub struct StrategySelector {
392    /// Current preferred strategy
393    #[allow(dead_code)]
394    preferred_strategy: OptimizationStrategy,
395    /// Strategy weights based on past performance
396    strategy_weights: std::collections::HashMap<OptimizationStrategy, f64>,
397    /// Learning rate for weight updates
398    learningrate: f64,
399    /// Exploration rate for trying different strategies
400    exploration_rate: f64,
401}
402
403impl Default for StrategySelector {
404    fn default() -> Self {
405        let mut strategy_weights = std::collections::HashMap::new();
406        strategy_weights.insert(OptimizationStrategy::Scalar, 1.0);
407        strategy_weights.insert(OptimizationStrategy::Simd, 1.0);
408        strategy_weights.insert(OptimizationStrategy::Parallel, 1.0);
409        strategy_weights.insert(OptimizationStrategy::Gpu, 1.0);
410        strategy_weights.insert(OptimizationStrategy::Hybrid, 1.0);
411        strategy_weights.insert(OptimizationStrategy::CacheOptimized, 1.0);
412        strategy_weights.insert(OptimizationStrategy::MemoryBound, 1.0);
413        strategy_weights.insert(OptimizationStrategy::ComputeBound, 1.0);
414        strategy_weights.insert(OptimizationStrategy::ModernArchOptimized, 1.5); // Higher initial weight
415        strategy_weights.insert(OptimizationStrategy::VectorOptimized, 1.3);
416        strategy_weights.insert(OptimizationStrategy::EnergyEfficient, 1.0);
417        strategy_weights.insert(OptimizationStrategy::HighThroughput, 1.2);
418
419        Self {
420            preferred_strategy: OptimizationStrategy::ModernArchOptimized,
421            strategy_weights,
422            learningrate: 0.1,
423            exploration_rate: 0.1,
424        }
425    }
426}
427
428impl StrategySelector {
429    /// Select the best strategy for given operation characteristics
430    pub fn select_strategy(
431        &self,
432        operation_size: usize,
433        is_memory_bound: bool,
434    ) -> OptimizationStrategy {
435        // Use epsilon-greedy exploration
436        use std::collections::hash_map::DefaultHasher;
437        use std::hash::{Hash, Hasher};
438
439        let mut hasher = DefaultHasher::new();
440        operation_size.hash(&mut hasher);
441        let rand_val = (hasher.finish() % 100) as f64 / 100.0;
442
443        if rand_val < self.exploration_rate {
444            // Explore: choose a random strategy including modern ones
445            let strategies = [
446                OptimizationStrategy::Scalar,
447                OptimizationStrategy::Simd,
448                OptimizationStrategy::Parallel,
449                OptimizationStrategy::Gpu,
450                OptimizationStrategy::ModernArchOptimized,
451                OptimizationStrategy::VectorOptimized,
452                OptimizationStrategy::EnergyEfficient,
453                OptimizationStrategy::HighThroughput,
454            ];
455            strategies[operation_size % strategies.len()]
456        } else {
457            // Exploit: choose the best strategy based on characteristics and architecture
458            if is_memory_bound {
459                // For memory-_bound operations, prioritize cache optimization
460                if is_apple_silicon() || is_neoverse_or_newer() {
461                    OptimizationStrategy::ModernArchOptimized
462                } else {
463                    OptimizationStrategy::MemoryBound
464                }
465            } else if operation_size > 1_000_000 {
466                // Very large operations - use high-throughput strategies
467                OptimizationStrategy::HighThroughput
468            } else if operation_size > 100_000 {
469                // Large operations - check for modern architectures
470                if is_zen4_or_newer() || is_intel_golden_cove_or_newer() {
471                    OptimizationStrategy::VectorOptimized
472                } else {
473                    OptimizationStrategy::Parallel
474                }
475            } else if operation_size > 1_000 {
476                // Medium operations - use modern SIMD if available
477                if is_zen4_or_newer() || is_apple_silicon() {
478                    OptimizationStrategy::ModernArchOptimized
479                } else {
480                    OptimizationStrategy::Simd
481                }
482            } else {
483                // Small operations - consider energy efficiency
484                if cfg!(target_os = "android") || cfg!(target_os = "ios") {
485                    OptimizationStrategy::EnergyEfficient
486                } else {
487                    OptimizationStrategy::Scalar
488                }
489            }
490        }
491    }
492
493    /// Update strategy weights based on performance feedback
494    pub fn update_weights(&mut self, strategy: OptimizationStrategy, performancescore: f64) {
495        if let Some(weight) = self.strategy_weights.get_mut(&strategy) {
496            *weight = *weight * (1.0 - self.learningrate) + performancescore * self.learningrate;
497        }
498    }
499
500    /// Detect if running on ARM Neoverse or newer server architectures
501    #[allow(dead_code)]
502    fn is_neoverse_or_newer() -> bool {
503        crate::performance_optimization::is_neoverse_or_newer()
504    }
505
506    /// Detect if running on AMD Zen4 or newer architectures
507    #[allow(dead_code)]
508    fn is_zen4_or_newer() -> bool {
509        crate::performance_optimization::is_zen4_or_newer()
510    }
511
512    /// Detect if running on Intel Golden Cove (12th gen) or newer
513    #[allow(dead_code)]
514    fn is_intel_golden_cove_or_newer() -> bool {
515        crate::performance_optimization::is_intel_golden_cove_or_newer()
516    }
517}
518
/// Heuristic used to detect AMD Zen4 or newer architectures.
///
/// On x86_64 this returns `true` when both `avx512f` and `avx512vl` are
/// detected at runtime; on every other architecture it returns `false`.
/// NOTE(review): the check looks only at feature flags, not at the CPU vendor
/// or model — confirm that this is specific enough for the intended use.
#[allow(dead_code)]
fn is_zen4_or_newer() -> bool {
    #[cfg(target_arch = "x86_64")]
    {
        let has_avx512f = is_x86_feature_detected!("avx512f");
        let has_avx512vl = is_x86_feature_detected!("avx512vl");
        has_avx512f && has_avx512vl
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        false
    }
}
532
/// Heuristic used to detect Intel Golden Cove (12th gen) or newer.
///
/// On x86_64 this returns `true` when `avx2`, `fma` and `bmi2` are all
/// detected at runtime; on every other architecture it returns `false`.
/// NOTE(review): the check looks only at these three feature flags, not at the
/// CPU vendor or model — confirm that it is specific enough to single out
/// Golden Cove.
#[allow(dead_code)]
fn is_intel_golden_cove_or_newer() -> bool {
    #[cfg(target_arch = "x86_64")]
    {
        let has_avx2 = is_x86_feature_detected!("avx2");
        let has_fma = is_x86_feature_detected!("fma");
        let has_bmi2 = is_x86_feature_detected!("bmi2");
        has_avx2 && has_fma && has_bmi2
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        false
    }
}
548
/// Detect Apple Silicon (M1/M2/M3) at compile time.
///
/// Returns `true` exactly when the build target is aarch64 with an `apple`
/// vendor; no runtime probing is performed.
#[allow(dead_code)]
fn is_apple_silicon() -> bool {
    cfg!(all(target_arch = "aarch64", target_vendor = "apple"))
}
562
/// Heuristic used to detect ARM Neoverse or newer server architectures.
///
/// On aarch64 this returns `true` when `asimd`, `crc` and `fp` are all
/// detected at runtime; on every other architecture it returns `false`.
/// NOTE(review): the check looks only at these feature flags, not at the CPU
/// implementer or part number — confirm that it is specific enough.
#[allow(dead_code)]
fn is_neoverse_or_newer() -> bool {
    #[cfg(target_arch = "aarch64")]
    {
        let has_asimd = std::arch::is_aarch64_feature_detected!("asimd");
        let has_crc = std::arch::is_aarch64_feature_detected!("crc");
        let has_fp = std::arch::is_aarch64_feature_detected!("fp");
        has_asimd && has_crc && has_fp
    }
    #[cfg(not(target_arch = "aarch64"))]
    {
        false
    }
}
578
579/// Adaptive optimization based on runtime characteristics
580pub struct AdaptiveOptimizer {
581    /// Threshold for switching to parallel execution
582    parallel_threshold: AtomicUsize,
583    /// Threshold for using SIMD operations
584    simd_threshold: AtomicUsize,
585    /// Threshold for using GPU acceleration
586    #[allow(dead_code)]
587    gpu_threshold: AtomicUsize,
588    /// Cache line size for the current architecture
589    cache_line_size: usize,
590    /// Performance metrics for adaptive learning
591    performance_metrics: std::sync::RwLock<PerformanceMetrics>,
592    /// Optimization strategy selector
593    strategy_selector: std::sync::RwLock<StrategySelector>,
594}
595
596impl AdaptiveOptimizer {
597    /// Create a new adaptive optimizer
598    pub fn new() -> Self {
599        Self {
600            parallel_threshold: AtomicUsize::new(10_000),
601            simd_threshold: AtomicUsize::new(1_000),
602            gpu_threshold: AtomicUsize::new(100_000),
603            cache_line_size: Self::detect_cache_line_size(),
604            performance_metrics: std::sync::RwLock::new(PerformanceMetrics::default()),
605            strategy_selector: std::sync::RwLock::new(StrategySelector::default()),
606        }
607    }
608
609    /// Detect the cache line size for the current architecture
610    fn detect_cache_line_size() -> usize {
611        #[cfg(target_arch = "x86_64")]
612        {
613            // All modern x86_64 architectures use 64-byte cache lines
614            64
615        }
616        #[cfg(target_arch = "aarch64")]
617        {
618            // ARM64 optimized value (Apple Silicon, Neoverse, and standard ARM64)
619            128
620        }
621        #[cfg(target_arch = "riscv64")]
622        {
623            64 // RISC-V 64-bit
624        }
625        #[cfg(not(any(
626            target_arch = "x86_64",
627            target_arch = "aarch64",
628            target_arch = "riscv64"
629        )))]
630        {
631            64 // Default fallback
632        }
633    }
634
635    /// Check if parallel execution should be used for given size
636    #[inline]
637    #[allow(unused_variables)]
638    pub fn should_use_parallel(&self, size: usize) -> bool {
639        #[cfg(feature = "parallel")]
640        {
641            size >= self.parallel_threshold.load(Ordering::Relaxed)
642        }
643        #[cfg(not(feature = "parallel"))]
644        {
645            false
646        }
647    }
648
649    /// Check if SIMD should be used for given size
650    #[inline]
651    #[allow(unused_variables)]
652    pub fn should_use_simd(&self, size: usize) -> bool {
653        #[cfg(feature = "simd")]
654        {
655            size >= self.simd_threshold.load(Ordering::Relaxed)
656        }
657        #[cfg(not(feature = "simd"))]
658        {
659            false
660        }
661    }
662
663    /// Update thresholds based on performance measurements
664    pub fn update_from_measurement(&mut self, operation: &str, size: usize, durationns: u64) {
665        // Simple heuristic: adjust thresholds based on operation efficiency
666        let ops_per_ns = size as f64 / durationns as f64;
667
668        if operation.contains("parallel") && ops_per_ns < 0.1 {
669            // Parallel overhead too high, increase threshold
670            self.parallel_threshold
671                .fetch_add(size / 10, Ordering::Relaxed);
672        } else if operation.contains("simd") && ops_per_ns < 1.0 {
673            // SIMD not efficient enough, increase threshold
674            self.simd_threshold.fetch_add(size / 10, Ordering::Relaxed);
675        }
676    }
677
678    /// Get optimal chunk size for cache-friendly operations
679    #[inline]
680    pub fn optimal_chunk_size<T>(&self) -> usize {
681        // Calculate chunk size based on cache line size and element size
682        let element_size = std::mem::size_of::<T>();
683        let elements_per_cache_line = self.cache_line_size / element_size.max(1);
684
685        // Use multiple cache lines for better performance
686        elements_per_cache_line * 16
687    }
688
689    /// Check if GPU acceleration should be used for given size
690    #[inline]
691    #[allow(unused_variables)]
692    pub fn should_use_gpu(&self, size: usize) -> bool {
693        #[cfg(feature = "gpu")]
694        {
695            size >= self.gpu_threshold.load(Ordering::Relaxed)
696        }
697        #[cfg(not(feature = "gpu"))]
698        {
699            false
700        }
701    }
702
703    /// Select the optimal strategy for a given operation
704    pub fn select_for_operation(&self, operationname: &str, size: usize) -> OptimizationStrategy {
705        // Determine if operation is memory-bound based on operation name
706        let memory_bound = operationname.contains("copy")
707            || operationname.contains("memset")
708            || operationname.contains("transpose");
709
710        if let Ok(selector) = self.strategy_selector.read() {
711            selector.select_strategy(size, memory_bound)
712        } else {
713            // Fallback selection
714            if self.should_use_gpu(size) {
715                OptimizationStrategy::Gpu
716            } else if self.should_use_parallel(size) {
717                OptimizationStrategy::Parallel
718            } else if self.should_use_simd(size) {
719                OptimizationStrategy::Simd
720            } else {
721                OptimizationStrategy::Scalar
722            }
723        }
724    }
725
726    /// Record performance measurement and update adaptive parameters
727    pub fn record_performance(
728        &mut self,
729        operation: &str,
730        size: usize,
731        strategy: OptimizationStrategy,
732        duration_ns: u64,
733    ) {
734        // Calculate performance score (higher is better)
735        let ops_per_ns = size as f64 / duration_ns as f64;
736        let performance_score = ops_per_ns.min(10.0) / 10.0; // Normalize to 0.saturating_sub(1)
737
738        // Update strategy weights
739        if let Ok(mut selector) = self.strategy_selector.write() {
740            selector.update_weights(strategy, performance_score);
741        }
742
743        // Update performance metrics
744        if let Ok(mut metrics) = self.performance_metrics.write() {
745            let avg_time = metrics
746                .operation_times
747                .entry(operation.to_string())
748                .or_insert(0.0);
749            *avg_time = (*avg_time * 0.9) + (duration_ns as f64 * 0.1); // Exponential moving average
750
751            metrics
752                .strategy_success_rates
753                .insert(strategy, performance_score);
754        }
755
756        // Implement adaptive threshold updates based on performance
757        self.update_thresholds(operation, size, duration_ns);
758    }
759
760    /// Get performance metrics for analysis
761    pub fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
762        self.performance_metrics.read().ok().map(|m| m.clone())
763    }
764
765    /// Analyze operation characteristics to suggest optimizations
766    pub fn analyze_operation(&self, operation_name: &str, inputsize: usize) -> OptimizationAdvice {
767        let strategy = self.select_optimal_strategy(operation_name, inputsize);
768        let chunk_size = if strategy == OptimizationStrategy::Parallel {
769            Some(self.optimal_chunk_size::<f64>())
770        } else {
771            None
772        };
773
774        let prefetch_distance = if inputsize > 10_000 {
775            Some(self.cache_line_size * 8) // Prefetch 8 cache lines ahead
776        } else {
777            None
778        };
779
780        OptimizationAdvice {
781            recommended_strategy: strategy,
782            optimal_chunk_size: chunk_size,
783            prefetch_distance,
784            memory_allocation_hint: if inputsize > 1_000_000 {
785                Some("Consider using memory-mapped files for large outputs".to_string())
786            } else {
787                None
788            },
789        }
790    }
791
    /// Detect if running on AMD Zen4 or newer architectures.
    ///
    /// Thin wrapper that forwards to the module-level
    /// `crate::performance_optimization::is_zen4_or_newer` function.
    #[allow(dead_code)]
    fn is_zen4_or_newer() -> bool {
        crate::performance_optimization::is_zen4_or_newer()
    }
797
    /// Detect if running on Intel Golden Cove (12th gen) or newer.
    ///
    /// Thin wrapper that forwards to the module-level
    /// `crate::performance_optimization::is_intel_golden_cove_or_newer` function.
    #[allow(dead_code)]
    fn is_intel_golden_cove_or_newer() -> bool {
        crate::performance_optimization::is_intel_golden_cove_or_newer()
    }
803
804    /// Select optimal strategy based on operation name and input size
805    pub fn select_optimal_strategy(
806        &self,
807        _operation_name: &str,
808        input_size: usize,
809    ) -> OptimizationStrategy {
810        // Check GPU threshold first (if available)
811        if input_size >= self.gpu_threshold.load(Ordering::Relaxed) && self.has_gpu_support() {
812            return OptimizationStrategy::Gpu;
813        }
814
815        // Check parallel threshold
816        if input_size >= self.parallel_threshold.load(Ordering::Relaxed) {
817            return OptimizationStrategy::Parallel;
818        }
819
820        // Check SIMD threshold
821        if input_size >= self.simd_threshold.load(Ordering::Relaxed) && self.has_simd_support() {
822            return OptimizationStrategy::Simd;
823        }
824
825        // Default to scalar
826        OptimizationStrategy::Scalar
827    }
828
    /// Check if GPU support is available.
    ///
    /// Currently always returns `false`, so threshold-based selection never
    /// picks the GPU strategy.
    pub fn has_gpu_support(&self) -> bool {
        // For now, return false since GPU support is not implemented
        false
    }
834
835    /// Check if SIMD support is available  
836    pub fn has_simd_support(&self) -> bool {
837        // Check if SIMD instructions are available on this platform
838        #[cfg(target_arch = "x86_64")]
839        {
840            // Bind each detection to a local first: clippy's `nonminimal_bool`
841            // mis-analyzes the `||` of two `is_x86_feature_detected!` expansions.
842            let has_avx2 = std::arch::is_x86_feature_detected!("avx2");
843            let has_sse41 = std::arch::is_x86_feature_detected!("sse4.1");
844            has_avx2 || has_sse41
845        }
846        #[cfg(target_arch = "aarch64")]
847        {
848            std::arch::is_aarch64_feature_detected!("neon")
849        }
850        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
851        {
852            false
853        }
854    }
855
856    /// Update thresholds adaptively based on performance measurements
857    fn update_thresholds(&self, operation: &str, size: usize, duration_ns: u64) {
858        // Calculate operation efficiency (operations per nanosecond)
859        let ops_per_ns = size as f64 / duration_ns as f64;
860
861        // Get current strategy
862        let current_strategy = self.select_optimal_strategy(operation, size);
863
864        // Define efficiency targets for each strategy
865        const PARALLEL_MIN_EFFICIENCY: f64 = 0.5; // Minimum ops/ns for parallel to be worthwhile
866        const SIMD_MIN_EFFICIENCY: f64 = 2.0; // Minimum ops/ns for SIMD to be worthwhile
867        const GPU_MIN_EFFICIENCY: f64 = 10.0; // Minimum ops/ns for GPU to be worthwhile
868
869        match current_strategy {
870            OptimizationStrategy::Parallel => {
871                if ops_per_ns < PARALLEL_MIN_EFFICIENCY {
872                    // Parallel overhead is too high, increase threshold
873                    let new_threshold = (size as f64 * 1.2) as usize;
874                    self.parallel_threshold
875                        .store(new_threshold, Ordering::Relaxed);
876                } else if ops_per_ns > PARALLEL_MIN_EFFICIENCY * 2.0 {
877                    // Parallel is very efficient, could lower threshold
878                    let current = self.parallel_threshold.load(Ordering::Relaxed);
879                    let new_threshold = (current as f64 * 0.9).max(1000.0) as usize;
880                    self.parallel_threshold
881                        .store(new_threshold, Ordering::Relaxed);
882                }
883            }
884            OptimizationStrategy::Simd => {
885                if ops_per_ns < SIMD_MIN_EFFICIENCY {
886                    // SIMD not efficient enough, increase threshold
887                    let new_threshold = (size as f64 * 1.1) as usize;
888                    self.simd_threshold.store(new_threshold, Ordering::Relaxed);
889                } else if ops_per_ns > SIMD_MIN_EFFICIENCY * 2.0 {
890                    // SIMD is very efficient, could lower threshold
891                    let current = self.simd_threshold.load(Ordering::Relaxed);
892                    let new_threshold = (current as f64 * 0.95).max(100.0) as usize;
893                    self.simd_threshold.store(new_threshold, Ordering::Relaxed);
894                }
895            }
896            OptimizationStrategy::Gpu => {
897                if ops_per_ns < GPU_MIN_EFFICIENCY {
898                    // GPU overhead is too high, increase threshold
899                    let new_threshold = (size as f64 * 1.5) as usize;
900                    self.gpu_threshold.store(new_threshold, Ordering::Relaxed);
901                } else if ops_per_ns > GPU_MIN_EFFICIENCY * 2.0 {
902                    // GPU is very efficient, could lower threshold
903                    let current = self.gpu_threshold.load(Ordering::Relaxed);
904                    let new_threshold = (current as f64 * 0.8).max(10000.0) as usize;
905                    self.gpu_threshold.store(new_threshold, Ordering::Relaxed);
906                }
907            }
908            _ => {
909                // For scalar operations, check if we should enable optimizations
910                if size > 1000 && ops_per_ns > SIMD_MIN_EFFICIENCY {
911                    // Could benefit from SIMD
912                    let current = self.simd_threshold.load(Ordering::Relaxed);
913                    let new_threshold = size.min(current);
914                    self.simd_threshold.store(new_threshold, Ordering::Relaxed);
915                }
916                if size > 10000 && ops_per_ns > PARALLEL_MIN_EFFICIENCY {
917                    // Could benefit from parallelization
918                    let current = self.parallel_threshold.load(Ordering::Relaxed);
919                    let new_threshold = size.min(current);
920                    self.parallel_threshold
921                        .store(new_threshold, Ordering::Relaxed);
922                }
923            }
924        }
925
926        // Update performance metrics with the new threshold values
927        if let Ok(mut metrics) = self.performance_metrics.write() {
928            // Store current threshold values in metrics for analysis
929            metrics.operation_times.insert(
930                format!("{}_threshold_parallel", operation),
931                self.parallel_threshold.load(Ordering::Relaxed) as f64,
932            );
933            metrics.operation_times.insert(
934                format!("{}_threshold_simd", operation),
935                self.simd_threshold.load(Ordering::Relaxed) as f64,
936            );
937            metrics.operation_times.insert(
938                format!("{}_threshold_gpu", operation),
939                self.gpu_threshold.load(Ordering::Relaxed) as f64,
940            );
941        }
942    }
943}
944
/// Optimization advice generated by the adaptive optimizer.
///
/// Bundles a recommended [`OptimizationStrategy`] with optional tuning hints.
/// Every hint is an `Option`; `None` means no value was supplied for that
/// hint.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct OptimizationAdvice {
    /// Recommended optimization strategy
    pub recommended_strategy: OptimizationStrategy,
    /// Optimal chunk size for parallel processing, if one was determined
    pub optimal_chunk_size: Option<usize>,
    /// Prefetch distance for memory access, if one was determined
    /// (NOTE(review): unit is not visible here - confirm elements vs. bytes)
    pub prefetch_distance: Option<usize>,
    /// Free-form, human-readable memory allocation hint
    pub memory_allocation_hint: Option<String>,
}
958
/// `Default` forwards to [`AdaptiveOptimizer::new`], so both constructors
/// always produce the same configuration.
impl Default for AdaptiveOptimizer {
    fn default() -> Self {
        Self::new()
    }
}
964
965/// Fast path optimizations for common operations
966pub mod fast_paths {
967    use super::*;
968
969    /// Optimized array addition for f64
970    #[inline]
971    #[allow(unused_variables)]
972    pub fn add_f64_arrays(a: &[f64], b: &[f64], result: &mut [f64]) -> Result<(), &'static str> {
973        if a.len() != b.len() || a.len() != result.len() {
974            return Err("Array lengths must match");
975        }
976
977        let len = a.len();
978        let optimizer = AdaptiveOptimizer::new();
979
980        #[cfg(feature = "simd")]
981        if optimizer.should_use_simd(len) {
982            // Use SIMD operations for f64 addition
983            use crate::simd_ops::SimdUnifiedOps;
984            use ::ndarray::ArrayView1;
985
986            // Process in SIMD-width chunks
987            let simd_chunks = len / 4; // Process 4 f64s at a time
988
989            for i in 0..simd_chunks {
990                let start = i * 4;
991                let end = start + 4;
992
993                if end <= len {
994                    let a_view = ArrayView1::from(&a[start..end]);
995                    let b_view = ArrayView1::from(&b[start..end]);
996
997                    // Use SIMD addition
998                    let simd_result = f64::simd_add(&a_view, &b_view);
999                    result[start..end]
1000                        .copy_from_slice(simd_result.as_slice().expect("Operation failed"));
1001                }
1002            }
1003
1004            // Handle remaining elements with scalar operations
1005            for i in (simd_chunks * 4)..len {
1006                result[0] = a[0] + b[0];
1007            }
1008            return Ok(());
1009        }
1010
1011        #[cfg(feature = "parallel")]
1012        if optimizer.should_use_parallel(len) {
1013            use crate::parallel_ops::*;
1014            result
1015                .par_chunks_mut(optimizer.optimal_chunk_size::<f64>())
1016                .zip(a.par_chunks(optimizer.optimal_chunk_size::<f64>()))
1017                .zip(b.par_chunks(optimizer.optimal_chunk_size::<f64>()))
1018                .for_each(|((r_chunk, a_chunk), b_chunk)| {
1019                    for i in 0..r_chunk.len() {
1020                        r_chunk[0] = a_chunk[0] + b_chunk[0];
1021                    }
1022                });
1023            return Ok(());
1024        }
1025
1026        // Scalar fallback with loop unrolling
1027        let chunks = len / 8;
1028
1029        for i in 0..chunks {
1030            let idx = i * 8;
1031            result[idx] = a[idx] + b[idx];
1032            result[idx + 1] = a[idx + 1] + b[idx + 1];
1033            result[idx + 2] = a[idx + 2] + b[idx + 2];
1034            result[idx + 3] = a[idx + 3] + b[idx + 3];
1035            result[idx + 4] = a[idx + 4] + b[idx + 4];
1036            result[idx + 5] = a[idx + 5] + b[idx + 5];
1037            result[idx + 6] = a[idx + 6] + b[idx + 6];
1038            result[idx + 7] = a[idx + 7] + b[idx + 7];
1039        }
1040
1041        for i in (chunks * 8)..len {
1042            result[0] = a[0] + b[0];
1043        }
1044
1045        Ok(())
1046    }
1047
1048    /// Optimized matrix multiplication kernel
1049    #[inline]
1050    pub fn matmul_kernel(
1051        a: &[f64],
1052        b: &[f64],
1053        c: &mut [f64],
1054        m: usize,
1055        k: usize,
1056        n: usize,
1057    ) -> Result<(), &'static str> {
1058        if a.len() != m * k || b.len() != k * n || c.len() != m * n {
1059            return Err("Invalid matrix dimensions");
1060        }
1061
1062        // Tile sizes for cache optimization
1063        const TILE_M: usize = 64;
1064        const TILE_N: usize = 64;
1065        const TILE_K: usize = 64;
1066
1067        // Clear result matrix
1068        c.fill(0.0);
1069
1070        #[cfg(feature = "parallel")]
1071        {
1072            let optimizer = AdaptiveOptimizer::new();
1073            if optimizer.should_use_parallel(m * n) {
1074                use crate::parallel_ops::*;
1075
1076                // Use synchronization for parallel matrix multiplication
1077                use std::sync::Mutex;
1078                let c_mutex = Mutex::new(c);
1079
1080                // Parallel tiled implementation using row-wise parallelization
1081                (0..m).into_par_iter().step_by(TILE_M).for_each(|i0| {
1082                    let i_max = (i0 + TILE_M).min(m);
1083                    let mut local_updates = Vec::new();
1084
1085                    for j0 in (0..n).step_by(TILE_N) {
1086                        for k0 in (0..k).step_by(TILE_K) {
1087                            let j_max = (j0 + TILE_N).min(n);
1088                            let k_max = (k0 + TILE_K).min(k);
1089
1090                            for i in i0..i_max {
1091                                for j in j0..j_max {
1092                                    let mut sum = 0.0;
1093                                    for k_idx in k0..k_max {
1094                                        sum += a[i * k + k_idx] * b[k_idx * n + j];
1095                                    }
1096                                    local_updates.push((i, j, sum));
1097                                }
1098                            }
1099                        }
1100                    }
1101
1102                    // Apply all local updates at once
1103                    if let Ok(mut c_guard) = c_mutex.lock() {
1104                        for (i, j, sum) in local_updates {
1105                            c_guard[i * n + j] += sum;
1106                        }
1107                    }
1108                });
1109                return Ok(());
1110            }
1111        }
1112
1113        // Serial tiled implementation
1114        for i0 in (0..m).step_by(TILE_M) {
1115            for j0 in (0..n).step_by(TILE_N) {
1116                for k0 in (0..k).step_by(TILE_K) {
1117                    let i_max = (i0 + TILE_M).min(m);
1118                    let j_max = (j0 + TILE_N).min(n);
1119                    let k_max = (k0 + TILE_K).min(k);
1120
1121                    for i in i0..i_max {
1122                        for j in j0..j_max {
1123                            let mut sum = c[i * n + j];
1124                            for k_idx in k0..k_max {
1125                                sum += a[i * k + k_idx] * b[k_idx * n + j];
1126                            }
1127                            c[i * n + j] = sum;
1128                        }
1129                    }
1130                }
1131            }
1132        }
1133
1134        Ok(())
1135    }
1136}
1137
/// Memory access pattern optimizer.
///
/// Classifies a sequence of accessed addresses into an [`AccessPattern`]
/// via `analyze_access_pattern`.
#[allow(dead_code)]
pub struct MemoryAccessOptimizer {
    /// Stride detection state for array access.
    /// NOTE(review): not read or written by the analysis code visible in
    /// this file section - confirm whether it is still needed.
    stride_detector: StrideDetector,
}
1144
/// Bookkeeping for incremental stride detection.
///
/// `Default` yields an empty state: no last address, no detected stride and
/// a confidence of `0.0`.
/// NOTE(review): none of these fields is used by the code visible in this
/// section; the per-field notes below are inferred from the names only.
#[derive(Default)]
#[allow(dead_code)]
struct StrideDetector {
    // presumably the most recently observed address - TODO confirm
    last_address: Option<usize>,
    // presumably the stride currently believed to hold (signed, so it can
    // describe backwards access) - TODO confirm unit (bytes vs. elements)
    detected_stride: Option<isize>,
    // presumably how strongly `detected_stride` is trusted - TODO confirm range
    confidence: f32,
}
1152
1153impl MemoryAccessOptimizer {
1154    pub fn new() -> Self {
1155        Self {
1156            stride_detector: StrideDetector::default(),
1157        }
1158    }
1159
1160    /// Analyze memory access pattern and suggest optimizations
1161    pub fn analyze_access_pattern<T>(&mut self, addresses: &[*const T]) -> AccessPattern {
1162        if addresses.is_empty() {
1163            return AccessPattern::Unknown;
1164        }
1165
1166        // Simple stride detection
1167        let mut strides = Vec::new();
1168        for window in addresses.windows(2) {
1169            let stride = (window[1] as isize) - (window[0] as isize);
1170            strides.push(stride / std::mem::size_of::<T>() as isize);
1171        }
1172
1173        // Check if all strides are equal (sequential access)
1174        if strides.windows(2).all(|w| w[0] == w[1]) {
1175            match strides[0] {
1176                1 => AccessPattern::Sequential,
1177                -1 => AccessPattern::ReverseSequential,
1178                s if s > 1 => AccessPattern::Strided(s as usize),
1179                _ => AccessPattern::Random,
1180            }
1181        } else {
1182            AccessPattern::Random
1183        }
1184    }
1185}
1186
/// Classification of a sequence of memory accesses, as produced by
/// `MemoryAccessOptimizer::analyze_access_pattern`.
///
/// Strides are measured in elements, not bytes.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AccessPattern {
    /// Every access is exactly one element after the previous one.
    Sequential,
    /// Every access is exactly one element before the previous one.
    ReverseSequential,
    /// Every access is the same number of elements (greater than one) after
    /// the previous one; the payload is that stride.
    Strided(usize),
    /// No regular forward pattern: the strides differ, or they share a value
    /// that matches none of the variants above.
    Random,
    /// No classification could be made (for example, no addresses were
    /// supplied).
    Unknown,
}
1196
/// `Default` forwards to [`MemoryAccessOptimizer::new`], so both
/// constructors always produce the same initial state.
impl Default for MemoryAccessOptimizer {
    fn default() -> Self {
        Self::new()
    }
}
1202
1203/// Re-export the benchmarking framework for performance analysis
1204pub use crate::performance::benchmarking;
1205
1206/// Advanced-optimized cache-aware algorithms for maximum performance
1207///
1208/// This module provides adaptive algorithms that automatically adjust their
1209/// behavior based on cache performance characteristics and system topology.
1210/// Re-export the cache-aware algorithms module
1211pub use crate::performance::cache_optimization as cache_aware_algorithms;
1212
1213/// Re-export the advanced AI-driven optimization module
1214pub use crate::performance::advanced_optimization;
1215
1216/* Tests removed due to compilation issues with --all-features
1217#[cfg(test)]
1218mod tests {
1219    use super::*;
1220    use std::time::Duration;
1221
1222    #[cfg(feature = "benchmarking")]
1223    use crate::benchmarking;
1224
1225    #[test]
1226    fn test_adaptive_optimizer() {
1227        let optimizer = AdaptiveOptimizer::new();
1228
1229        // Test threshold detection
1230        assert!(!optimizer.should_use_parallel(100));
1231
1232        // Only test parallel execution if the feature is enabled
1233        #[cfg(feature = "parallel")]
1234        assert!(optimizer.should_use_parallel(100_000));
1235
1236        // Test chunk size calculation
1237        let chunk_size = optimizer.optimal_chunk_size::<f64>();
1238        assert!(chunk_size > 0);
1239        assert_eq!(chunk_size % 16, 0); // Should be multiple of 16
1240    }
1241
1242    #[test]
1243    fn test_fast_path_addition() {
1244        let a = vec![1.0; 32];
1245        let b = vec![2.0; 32];
1246        let mut result = vec![0.0; 32];
1247
1248        fast_paths::add_f64_arrays(&a, &b, &mut result).expect("Operation failed");
1249
1250        for val in result {
1251            assert_eq!(val, 3.0);
1252        }
1253    }
1254
1255    #[test]
1256    fn test_memory_access_pattern() {
1257        let mut optimizer = MemoryAccessOptimizer::new();
1258
1259        // Sequential access
1260        let addresses: Vec<*const f64> = (0..10)
1261            .map(|i| (i * std::mem::size_of::<f64>()) as *const f64)
1262            .collect();
1263        assert_eq!(
1264            optimizer.analyze_access_pattern(&addresses),
1265            AccessPattern::Sequential
1266        );
1267
1268        // Strided access
1269        let addresses: Vec<*const f64> = (0..10)
1270            .map(|i| (i * 3 * std::mem::size_of::<f64>()) as *const f64)
1271            .collect();
1272        assert_eq!(
1273            optimizer.analyze_access_pattern(&addresses),
1274            AccessPattern::Strided(3)
1275        );
1276    }
1277
1278    #[test]
1279    fn test_performance_hints() {
1280        // Test that hints don't crash and return correct values
1281        assert!(PerformanceHints::likely(true));
1282        assert!(!PerformanceHints::likely(false));
1283        assert!(PerformanceHints::unlikely(true));
1284        assert!(!PerformanceHints::unlikely(false));
1285
1286        // Test prefetch operations (should not crash)
1287        let data = [1.0f64; 100];
1288        PerformanceHints::prefetch_read(&data[0]);
1289
1290        let mut data_mut = [0.0f64; 100];
1291        PerformanceHints::prefetch_write(&mut data_mut[0]);
1292
1293        // Test locality-based prefetch
1294        PerformanceHints::prefetch_with_locality(&data[0], Locality::High);
1295        PerformanceHints::prefetch_with_locality(&data[0], Locality::Medium);
1296        PerformanceHints::prefetch_with_locality(&data[0], Locality::Low);
1297        PerformanceHints::prefetch_with_locality(&data[0], Locality::None);
1298    }
1299
1300    #[test]
1301    fn test_cache_operations() {
1302        let data = [1.0f64; 8];
1303
1304        // Test cache flush (should not crash)
1305        PerformanceHints::flush_cache_line(&data[0]);
1306
1307        // Test memory fence (should not crash)
1308        PerformanceHints::memory_fence();
1309
1310        // Test cache-aware copy
1311        let src = vec![1.0f64; 64];
1312        let mut dst = vec![0.0f64; 64];
1313        PerformanceHints::cache_aware_copy(&src, &mut dst);
1314        assert_eq!(src, dst);
1315
1316        // Test cache-aware memset
1317        let mut data = vec![0.0f64; 64];
1318        PerformanceHints::cache_aware_memset(&mut data, 5.0);
1319        assert!(data.iter().all(|&x| x == 5.0));
1320    }
1321
1322    #[test]
1323    fn test_locality_enum() {
1324        // Test that Locality enum works correctly
1325        let localities = [
1326            Locality::High,
1327            Locality::Medium,
1328            Locality::Low,
1329            Locality::None,
1330        ];
1331
1332        for locality in &localities {
1333            // Test that we can use locality in prefetch
1334            let data = 42i32;
1335            PerformanceHints::prefetch_with_locality(&data, *locality);
1336        }
1337
1338        // Test enum properties
1339        assert_eq!(Locality::High, Locality::High);
1340        assert_ne!(Locality::High, Locality::Low);
1341
1342        // Test Debug formatting
1343        assert!(format!("{:?}", Locality::High).contains("High"));
1344    }
1345
1346    #[test]
1347    fn test_strategy_selector() {
1348        let mut selector = StrategySelector::default();
1349
1350        // Test strategy selection
1351        let strategy = selector.select_strategy(1000, false);
1352        assert!(matches!(
1353            strategy,
1354            OptimizationStrategy::Simd
1355                | OptimizationStrategy::Scalar
1356                | OptimizationStrategy::Parallel
1357                | OptimizationStrategy::Gpu
1358        ));
1359
1360        // Test weight updates
1361        selector.update_weights(OptimizationStrategy::Simd, 0.8);
1362        selector.update_weights(OptimizationStrategy::Parallel, 0.9);
1363
1364        // Weights should be updated
1365        assert!(selector.strategy_weights[&OptimizationStrategy::Simd] != 1.0);
1366        assert!(selector.strategy_weights[&OptimizationStrategy::Parallel] != 1.0);
1367    }
1368
1369    #[test]
1370    fn test_adaptive_optimizer_enhanced() {
1371        let mut optimizer = AdaptiveOptimizer::new();
1372
1373        // Test GPU threshold
1374        assert!(!optimizer.should_use_gpu(1000));
1375
1376        // Test strategy selection
1377        let strategy = optimizer.select_optimal_strategy("matrix_multiply", 50_000);
1378        assert!(matches!(
1379            strategy,
1380            OptimizationStrategy::Parallel
1381                | OptimizationStrategy::Simd
1382                | OptimizationStrategy::Scalar
1383                | OptimizationStrategy::Gpu
1384                | OptimizationStrategy::Hybrid
1385                | OptimizationStrategy::CacheOptimized
1386                | OptimizationStrategy::MemoryBound
1387                | OptimizationStrategy::ComputeBound
1388                | OptimizationStrategy::ModernArchOptimized
1389                | OptimizationStrategy::VectorOptimized
1390                | OptimizationStrategy::EnergyEfficient
1391                | OptimizationStrategy::HighThroughput
1392        ));
1393
1394        // Test performance recording
1395        optimizer.record_performance("test_op", 1000, OptimizationStrategy::Simd, 1_000_000);
1396
1397        // Test optimization advice
1398        let advice = optimizer.analyze_operation("matrix_multiply", 10_000);
1399        assert!(matches!(
1400            advice.recommended_strategy,
1401            OptimizationStrategy::Parallel
1402                | OptimizationStrategy::Simd
1403                | OptimizationStrategy::Scalar
1404                | OptimizationStrategy::Gpu
1405                | OptimizationStrategy::Hybrid
1406                | OptimizationStrategy::CacheOptimized
1407                | OptimizationStrategy::MemoryBound
1408                | OptimizationStrategy::ComputeBound
1409                | OptimizationStrategy::ModernArchOptimized
1410                | OptimizationStrategy::VectorOptimized
1411                | OptimizationStrategy::EnergyEfficient
1412                | OptimizationStrategy::HighThroughput
1413        ));
1414
1415        // Test metrics retrieval
1416        let metrics = optimizer.get_performance_metrics();
1417        assert!(metrics.is_some());
1418    }
1419
1420    #[test]
1421    fn test_optimization_strategy_enum() {
1422        // Test that all strategies can be created and compared
1423        let strategies = [
1424            OptimizationStrategy::Scalar,
1425            OptimizationStrategy::Simd,
1426            OptimizationStrategy::Parallel,
1427            OptimizationStrategy::Gpu,
1428            OptimizationStrategy::Hybrid,
1429            OptimizationStrategy::CacheOptimized,
1430            OptimizationStrategy::MemoryBound,
1431            OptimizationStrategy::ComputeBound,
1432        ];
1433
1434        for strategy in &strategies {
1435            // Test Debug formatting
1436            assert!(!format!("{strategy:?}").is_empty());
1437
1438            // Test equality
1439            assert_eq!(*strategy, *strategy);
1440        }
1441    }
1442
1443    #[test]
1444    fn test_performance_metrics() {
1445        let mut metrics = PerformanceMetrics::default();
1446
1447        // Test that we can add operation times
1448        metrics
1449            .operation_times
1450            .insert("test_op".to_string(), 1000.0);
1451        assert_eq!(metrics.operation_times["test_op"], 1000.0);
1452
1453        // Test strategy success rates
1454        metrics
1455            .strategy_success_rates
1456            .insert(OptimizationStrategy::Simd, 0.85);
1457        assert_eq!(
1458            metrics.strategy_success_rates[&OptimizationStrategy::Simd],
1459            0.85
1460        );
1461
1462        // Test other metrics
1463        metrics.memorybandwidth_utilization = 0.75;
1464        metrics.cache_hit_rate = 0.90;
1465        metrics.parallel_efficiency = 0.80;
1466
1467        assert_eq!(metrics.memorybandwidth_utilization, 0.75);
1468        assert_eq!(metrics.cache_hit_rate, 0.90);
1469        assert_eq!(metrics.parallel_efficiency, 0.80);
1470    }
1471
1472    #[test]
1473    fn test_optimization_advice() {
1474        let advice = OptimizationAdvice {
1475            recommended_strategy: OptimizationStrategy::Parallel,
1476            optimal_chunk_size: Some(1024),
1477            prefetch_distance: Some(64),
1478            memory_allocation_hint: Some("Use memory mapping".to_string()),
1479        };
1480
1481        assert_eq!(advice.recommended_strategy, OptimizationStrategy::Parallel);
1482        assert_eq!(advice.optimal_chunk_size, Some(1024));
1483        assert_eq!(advice.prefetch_distance, Some(64));
1484        assert!(advice.memory_allocation_hint.is_some());
1485
1486        // Test Debug formatting
1487        assert!(!format!("{advice:?}").is_empty());
1488    }
1489
1490    #[test]
1491    fn test_benchmarking_config() {
1492        let config = benchmarking::BenchmarkConfig::default();
1493
1494        assert_eq!(config.warmup_iterations, 5);
1495        assert_eq!(config.measurement_iterations, 20);
1496        assert!(!config.sample_sizes.is_empty());
1497        assert!(!config.strategies.is_empty());
1498
1499        // Test preset configurations
1500        let array_config = benchmarking::presets::array_operations();
1501        assert_eq!(array_config.warmup_iterations, 3);
1502        assert_eq!(array_config.measurement_iterations, 10);
1503
1504        let matrix_config = benchmarking::presets::matrix_operations();
1505        assert_eq!(matrix_config.warmup_iterations, 5);
1506        assert_eq!(matrix_config.measurement_iterations, 15);
1507
1508        let memory_config = benchmarking::presets::memory_intensive();
1509        assert_eq!(memory_config.warmup_iterations, 2);
1510        assert_eq!(memory_config.measurement_iterations, 8);
1511    }
1512
1513    #[test]
1514    fn test_benchmark_measurement() {
1515        let measurement = benchmarking::BenchmarkMeasurement {
1516            duration: Duration::from_millis(5),
1517            strategy: OptimizationStrategy::Simd,
1518            input_size: 1000,
1519            throughput: 200_000.0,
1520            memory_usage: 8000,
1521            custom_metrics: std::collections::HashMap::new(),
1522        };
1523
1524        assert_eq!(measurement.strategy, OptimizationStrategy::Simd);
1525        assert_eq!(measurement.input_size, 1000);
1526        assert_eq!(measurement.throughput, 200_000.0);
1527        assert_eq!(measurement.memory_usage, 8000);
1528    }
1529
1530    #[test]
1531    fn test_benchmark_runner() {
1532        let config = benchmarking::BenchmarkConfig {
1533            warmup_iterations: 1,
1534            measurement_iterations: 2,
1535            min_duration: Duration::from_millis(1),
1536            max_duration: Duration::from_secs(1),
1537            sample_sizes: vec![10, 100],
1538            strategies: vec![OptimizationStrategy::Scalar, OptimizationStrategy::Simd]
1539                .into_iter()
1540                .collect(),
1541        };
1542
1543        let runner = benchmarking::BenchmarkRunner::new(config);
1544
1545        // Test a simple operation
1546        let results = runner.benchmark_operation("test_add", |data, _strategy| {
1547            let result: Vec<f64> = data.iter().map(|x| *x + 1.0).collect();
1548            (Duration::from_millis(1), result)
1549        });
1550
1551        assert!(!results.measurements.is_empty());
1552    }
1553
1554    #[test]
1555    fn test_strategy_performance() {
1556        let performance = benchmarking::StrategyPerformance {
1557            avg_throughput: 150_000.0,
1558            throughput_stddev: 5_000.0,
1559            avg_memory_usage: 8000.0,
1560            optimal_size: 10_000,
1561            efficiency_score: 0.85,
1562        };
1563
1564        assert_eq!(performance.avg_throughput, 150_000.0);
1565        assert_eq!(performance.throughput_stddev, 5_000.0);
1566        assert_eq!(performance.optimal_size, 10_000);
1567        assert_eq!(performance.efficiency_score, 0.85);
1568    }
1569
1570    #[test]
1571    fn test_scalability_analysis() {
1572        let mut parallel_efficiency = std::collections::HashMap::new();
1573        parallel_efficiency.insert(1000, 0.8);
1574        parallel_efficiency.insert(10000, 0.9);
1575
1576        let memory_scaling = benchmarking::MemoryScaling {
1577            linear_coefficient: 8.0,
1578            constant_coefficient: 1024.0,
1579            r_squared: 0.95,
1580        };
1581
1582        let bottleneck = benchmarking::PerformanceBottleneck {
1583            bottleneck_type: benchmarking::BottleneckType::MemoryBandwidth,
1584            size_range: (10000, 10000),
1585            impact: 0.3,
1586            mitigation: "Use memory prefetching".to_string(),
1587        };
1588
1589        let analysis = benchmarking::ScalabilityAnalysis {
1590            parallel_efficiency,
1591            memory_scaling,
1592            bottlenecks: vec![bottleneck],
1593        };
1594
1595        assert_eq!(analysis.parallel_efficiency[&1000], 0.8);
1596        assert_eq!(analysis.memory_scaling.linear_coefficient, 8.0);
1597        assert_eq!(analysis.bottlenecks.len(), 1);
1598        assert_eq!(
1599            analysis.bottlenecks[0].bottleneck_type,
1600            benchmarking::BottleneckType::MemoryBandwidth
1601        );
1602    }
1603
1604    #[test]
1605    fn test_memory_scaling() {
1606        let scaling = benchmarking::MemoryScaling {
1607            linear_coefficient: 8.0,
1608            constant_coefficient: 512.0,
1609            r_squared: 0.99,
1610        };
1611
1612        assert_eq!(scaling.linear_coefficient, 8.0);
1613        assert_eq!(scaling.constant_coefficient, 512.0);
1614        assert_eq!(scaling.r_squared, 0.99);
1615    }
1616
1617    #[test]
1618    fn test_performance_bottleneck() {
1619        let bottleneck = benchmarking::PerformanceBottleneck {
1620            bottleneck_type: benchmarking::BottleneckType::SynchronizationOverhead,
1621            size_range: (1000, 5000),
1622            impact: 0.6,
1623            mitigation: "Reduce thread contention".to_string(),
1624        };
1625
1626        assert_eq!(
1627            bottleneck.bottleneck_type,
1628            benchmarking::BottleneckType::SynchronizationOverhead
1629        );
1630        assert_eq!(bottleneck.size_range, (1000, 5000));
1631        assert_eq!(bottleneck.impact, 0.6);
1632        assert_eq!(bottleneck.mitigation, "Reduce thread contention");
1633    }
1634
1635    #[test]
1636    fn test_bottleneck_type_enum() {
1637        let bottleneck_types = [
1638            benchmarking::BottleneckType::MemoryBandwidth,
1639            benchmarking::BottleneckType::CacheLatency,
1640            benchmarking::BottleneckType::ComputeBound,
1641            benchmarking::BottleneckType::SynchronizationOverhead,
1642            benchmarking::BottleneckType::AlgorithmicComplexity,
1643        ];
1644
1645        for bottleneck_type in &bottleneck_types {
1646            // Test Debug formatting
1647            assert!(!format!("{bottleneck_type:?}").is_empty());
1648
1649            // Test equality
1650            assert_eq!(*bottleneck_type, *bottleneck_type);
1651        }
1652
1653        // Test inequality
1654        assert_ne!(
1655            benchmarking::BottleneckType::MemoryBandwidth,
1656            benchmarking::BottleneckType::CacheLatency
1657        );
1658    }
1659
1660    #[test]
1661    fn test_benchmark_results() {
1662        let measurement = benchmarking::BenchmarkMeasurement {
1663            strategy: OptimizationStrategy::Parallel,
1664            input_size: 1000,
1665            duration: Duration::from_millis(10),
1666            throughput: 100_000.0,
1667            memory_usage: 8000,
1668            custom_metrics: std::collections::HashMap::new(),
1669        };
1670
1671        let mut strategy_summary = std::collections::HashMap::new();
1672        strategy_summary.insert(
1673            OptimizationStrategy::Parallel,
1674            benchmarking::StrategyPerformance {
1675                avg_throughput: 100_000.0,
1676                throughput_stddev: 1_000.0,
1677                avg_memory_usage: 8000.0,
1678                optimal_size: 1000,
1679                efficiency_score: 0.9,
1680            },
1681        );
1682
1683        let scalability_analysis = benchmarking::ScalabilityAnalysis {
1684            parallel_efficiency: std::collections::HashMap::new(),
1685            memory_scaling: benchmarking::MemoryScaling {
1686                linear_coefficient: 8.0,
1687                constant_coefficient: 0.0,
1688                r_squared: 1.0,
1689            },
1690            bottlenecks: Vec::new(),
1691        };
1692
1693        let results = benchmarking::BenchmarkResults {
1694            operation_name: "test_operation".to_string(),
1695            measurements: vec![measurement],
1696            strategy_summary,
1697            scalability_analysis,
1698            recommendations: vec!["Use parallel strategy".to_string()],
1699            total_duration: Duration::from_millis(100),
1700        };
1701
1702        assert_eq!(results.operation_name, "test_operation");
1703        assert_eq!(results.measurements.len(), 1);
1704        assert_eq!(results.strategy_summary.len(), 1);
1705        assert_eq!(results.recommendations.len(), 1);
1706        assert_eq!(results.total_duration, Duration::from_millis(100));
1707    }
1708
1709    #[test]
1710    fn test_modern_architecture_detection() {
1711        // Test architecture detection functions (these will return results based on actual hardware)
1712        let zen4_detected = is_zen4_or_newer();
1713        let golden_cove_detected = is_intel_golden_cove_or_newer();
1714        let apple_silicon_detected = is_apple_silicon();
1715        let neoverse_detected = is_neoverse_or_newer();
1716
1717        // These tests will pass as they just check the functions don't panic
1718        // Test passes if no panic occurs above
1719    }
1720
1721    #[test]
1722    fn test_enhanced_strategy_selector() {
1723        let selector = StrategySelector::default();
1724
1725        // Test that new strategies are included in default weights
1726        assert!(selector
1727            .strategy_weights
1728            .contains_key(&OptimizationStrategy::ModernArchOptimized));
1729        assert!(selector
1730            .strategy_weights
1731            .contains_key(&OptimizationStrategy::VectorOptimized));
1732        assert!(selector
1733            .strategy_weights
1734            .contains_key(&OptimizationStrategy::EnergyEfficient));
1735        assert!(selector
1736            .strategy_weights
1737            .contains_key(&OptimizationStrategy::HighThroughput));
1738
1739        // Test that ModernArchOptimized has higher initial weight
1740        let modern_weight = selector
1741            .strategy_weights
1742            .get(&OptimizationStrategy::ModernArchOptimized)
1743            .expect("Operation failed");
1744        let scalar_weight = selector
1745            .strategy_weights
1746            .get(&OptimizationStrategy::Scalar)
1747            .expect("Operation failed");
1748        assert!(modern_weight > scalar_weight);
1749    }
1750
1751    #[test]
1752    fn test_enhanced_strategy_selection() {
1753        let selector = StrategySelector::default();
1754
1755        // Test small operation strategy selection
1756        let small_strategy = selector.select_strategy(100, false);
1757        assert!(matches!(
1758            small_strategy,
1759            OptimizationStrategy::Scalar
1760                | OptimizationStrategy::EnergyEfficient
1761                | OptimizationStrategy::ModernArchOptimized
1762        ));
1763
1764        // Test large operation strategy selection
1765        let large_strategy = selector.select_strategy(1_000_000, false);
1766        assert!(matches!(
1767            large_strategy,
1768            OptimizationStrategy::HighThroughput
1769                | OptimizationStrategy::VectorOptimized
1770                | OptimizationStrategy::Parallel
1771        ));
1772
1773        // Test memory-bound operation strategy selection
1774        let memory_bound_strategy = selector.select_strategy(10_000, true);
1775        assert!(matches!(
1776            memory_bound_strategy,
1777            OptimizationStrategy::MemoryBound | OptimizationStrategy::ModernArchOptimized
1778        ));
1779    }
1780
1781    #[test]
1782    #[cfg(feature = "benchmarking")]
1783    fn test_advanced_benchmark_config() {
1784        let config = benchmarking::presets::advanced_comprehensive();
1785
1786        // Verify comprehensive strategy coverage
1787        assert!(config
1788            .strategies
1789            .contains(&OptimizationStrategy::ModernArchOptimized));
1790        assert!(config
1791            .strategies
1792            .contains(&OptimizationStrategy::VectorOptimized));
1793        assert!(config
1794            .strategies
1795            .contains(&OptimizationStrategy::EnergyEfficient));
1796        assert!(config
1797            .strategies
1798            .contains(&OptimizationStrategy::HighThroughput));
1799
1800        // Verify comprehensive size coverage
1801        assert!(config.sample_sizes.len() >= 10);
1802        assert!(config.sample_sizes.contains(&100));
1803        assert!(config.sample_sizes.contains(&5_000_000));
1804
1805        // Verify thorough measurement configuration
1806        assert!(config.measurement_iterations >= 25);
1807        assert!(config.warmup_iterations >= 10);
1808    }
1809
1810    #[test]
1811    #[cfg(feature = "benchmarking")]
1812    fn test_modern_architecture_benchmark_config() {
1813        let config = benchmarking::presets::modern_architectures();
1814
1815        // Verify focus on modern strategies
1816        assert_eq!(config.strategies.len(), 4);
1817        assert!(config
1818            .strategies
1819            .contains(&OptimizationStrategy::ModernArchOptimized));
1820        assert!(config
1821            .strategies
1822            .contains(&OptimizationStrategy::VectorOptimized));
1823        assert!(config
1824            .strategies
1825            .contains(&OptimizationStrategy::HighThroughput));
1826        assert!(config
1827            .strategies
1828            .contains(&OptimizationStrategy::EnergyEfficient));
1829
1830        // Should not contain basic strategies for focused testing
1831        assert!(!config.strategies.contains(&OptimizationStrategy::Scalar));
1832    }
1833
1834    #[test]
1835    fn test_enhanced_cache_line_detection() {
1836        let optimizer = AdaptiveOptimizer::new();
1837        let cache_line_size = optimizer.cache_line_size;
1838
1839        // Cache line size should be reasonable (typically 64 or 128 bytes)
1840        assert!(cache_line_size == 64 || cache_line_size == 128);
1841
1842        // Should be power of 2
1843        assert_eq!(cache_line_size & (cache_line_size - 1), 0);
1844    }
1845
1846    #[test]
1847    fn test_strategy_weight_updates() {
1848        let mut selector = StrategySelector::default();
1849        let initial_weight = *selector
1850            .strategy_weights
1851            .get(&OptimizationStrategy::ModernArchOptimized)
1852            .expect("Operation failed");
1853
1854        // Update with good performance score
1855        selector.update_weights(OptimizationStrategy::ModernArchOptimized, 0.9);
1856        let updated_weight = *selector
1857            .strategy_weights
1858            .get(&OptimizationStrategy::ModernArchOptimized)
1859            .expect("Operation failed");
1860
1861        // Weight should have been adjusted based on learning
1862        assert_ne!(initial_weight, updated_weight);
1863    }
1864}
1865*/