scirs2_core/performance_optimization.rs
1//! Performance optimization utilities for critical paths
2//!
3//! This module provides tools and utilities for optimizing performance-critical
4//! sections of scirs2-core based on profiling data. Enhanced with AI-driven
5//! adaptive optimization and ML-based performance modeling for Advanced mode.
6//!
7//! # Advanced Mode Features
8//!
9//! - **AI-Driven Strategy Selection**: Machine learning models predict optimal strategies
10//! - **Neural Performance Modeling**: Deep learning for performance prediction
11//! - **Adaptive Hyperparameter Tuning**: Automatic optimization parameter adjustment
12//! - **Real-time Performance Learning**: Continuous improvement from execution data
//! - **Multi-Objective Optimization**: Balance performance, memory, and energy efficiency
14//! - **Context-Aware Optimization**: Environment and workload-specific adaptations
15
16use std::sync::atomic::{AtomicUsize, Ordering};
17
/// Temporal-locality hint that selects which prefetch flavour is issued.
///
/// `PerformanceHints::prefetch_with_locality` maps the variants onto
/// `prefetcht0` / `prefetcht1` / `prefetcht2` / `prefetchnta` on x86_64 and
/// `pldl1keep` / `pldl2keep` / `pldl3keep` / `pldl1strm` on aarch64.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[allow(dead_code)]
pub enum Locality {
    /// Data is expected to be reused very soon (L1 cache).
    High,
    /// Data may be reused (L2 cache).
    Medium,
    /// Data is unlikely to be reused soon (L3 cache).
    Low,
    /// Streaming access without temporal locality (non-temporal prefetch).
    None,
}
31
/// Stateless namespace type for low-level performance hints.
///
/// All hints (branch hints, prefetches, fences, cache-aware copy/fill) are
/// associated functions, so the type is never instantiated by this module.
pub struct PerformanceHints;
34
35impl PerformanceHints {
/// Hint that `cond` is expected to be `true`; returns `cond` unchanged.
///
/// The hint is expressed on stable Rust by routing the unexpected (`false`)
/// outcome through an empty `#[cold]` function, which allows the optimizer
/// to lay that path out as rarely taken. The former implementation emitted
/// an inline-asm *comment* on x86_64 only: it produced no machine-level
/// hint, needed `unsafe`, and placed an opaque asm statement in hot code.
/// This version is safe and behaves the same on every architecture.
#[inline(always)]
pub fn likely(cond: bool) -> bool {
    /// Empty marker; a branch that calls it is treated as unlikely.
    #[cold]
    #[inline]
    fn cold_path() {}

    if !cond {
        cold_path();
    }
    cond
}
54
/// Hint that `cond` is expected to be `false`; returns `cond` unchanged.
///
/// The expected-rare (`true`) outcome is routed through an empty `#[cold]`
/// function so the optimizer can treat that path as rarely taken. The former
/// implementation emitted an inline-asm *comment* on x86_64 only, which gave
/// no machine-level hint, needed `unsafe`, and placed an opaque asm statement
/// in hot code. This version is safe and architecture independent.
#[inline(always)]
pub fn unlikely(cond: bool) -> bool {
    /// Empty marker; a branch that calls it is treated as unlikely.
    #[cold]
    #[inline]
    fn cold_path() {}

    if cond {
        cold_path();
    }
    cond
}
73
/// Issue a read prefetch for the address where `data` starts.
///
/// Emits `prefetcht0` on x86_64 and `prfm pldl1keep` on aarch64. On any other
/// architecture no prefetch is issued; the reference is only passed through
/// `std::hint::black_box`.
#[inline(always)]
pub fn prefetch_read<T>(data: &T) {
    let addr = (data as *const T).cast::<u8>();

    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        // No prefetch here: just keep `data` observable to the optimizer.
        std::hint::black_box(data);
    }

    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: `addr` comes from a live reference and the instruction is
        // only a prefetch hint for that address.
        unsafe {
            std::arch::asm!("prfm pldl1keep, [{}]", in(reg) addr, options(readonly, nostack));
        }
    }

    #[cfg(target_arch = "x86_64")]
    {
        // SAFETY: `addr` comes from a live reference and the instruction is
        // only a prefetch hint for that address.
        unsafe {
            std::arch::asm!("prefetcht0 [{}]", in(reg) addr, options(readonly, nostack));
        }
    }
}
107
/// Issue a prefetch ahead of a write to the address where `data` starts.
///
/// Emits `prfm pstl1keep` (prefetch for store) on aarch64. On x86_64 the same
/// `prefetcht0` instruction as the read prefetch is used. On any other
/// architecture no prefetch is issued; the reference is only passed through
/// `std::hint::black_box`.
#[inline(always)]
pub fn prefetch_write<T>(data: &mut T) {
    let addr = (data as *mut T).cast::<u8>();

    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        // No prefetch here: just keep `data` observable to the optimizer.
        std::hint::black_box(data);
    }

    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: `addr` comes from a live exclusive reference and the
        // instruction is only a prefetch hint for that address.
        unsafe {
            std::arch::asm!("prfm pstl1keep, [{}]", in(reg) addr, options(nostack));
        }
    }

    #[cfg(target_arch = "x86_64")]
    {
        // SAFETY: `addr` comes from a live exclusive reference and the
        // instruction is only a prefetch hint for that address.
        unsafe {
            std::arch::asm!("prefetcht0 [{}]", in(reg) addr, options(nostack));
        }
    }
}
141
142 /// Advanced prefetch with locality hint
143 #[inline(always)]
144 pub fn prefetch_with_locality<T>(data: &T, locality: Locality) {
145 let ptr = data as *const T as *const u8;
146
147 #[cfg(target_arch = "x86_64")]
148 {
149 unsafe {
150 match locality {
151 Locality::High => {
152 // Prefetch into L1 cache
153 std::arch::asm!(
154 "prefetcht0 [{}]",
155 in(reg) ptr,
156 options(readonly, nostack)
157 );
158 }
159 Locality::Medium => {
160 // Prefetch into L2 cache
161 std::arch::asm!(
162 "prefetcht1 [{}]",
163 in(reg) ptr,
164 options(readonly, nostack)
165 );
166 }
167 Locality::Low => {
168 // Prefetch into L3 cache
169 std::arch::asm!(
170 "prefetcht2 [{}]",
171 in(reg) ptr,
172 options(readonly, nostack)
173 );
174 }
175 Locality::None => {
176 // Non-temporal prefetch
177 std::arch::asm!(
178 "prefetchnta [{}]",
179 in(reg) ptr,
180 options(readonly, nostack)
181 );
182 }
183 }
184 }
185 }
186 #[cfg(target_arch = "aarch64")]
187 {
188 unsafe {
189 match locality {
190 Locality::High => {
191 std::arch::asm!(
192 "prfm pldl1keep, [{}]",
193 in(reg) ptr,
194 options(readonly, nostack)
195 );
196 }
197 Locality::Medium => {
198 std::arch::asm!(
199 "prfm pldl2keep, [{}]",
200 in(reg) ptr,
201 options(readonly, nostack)
202 );
203 }
204 Locality::Low => {
205 std::arch::asm!(
206 "prfm pldl3keep, [{}]",
207 in(reg) ptr,
208 options(readonly, nostack)
209 );
210 }
211 Locality::None => {
212 std::arch::asm!(
213 "prfm pldl1strm, [{}]",
214 in(reg) ptr,
215 options(readonly, nostack)
216 );
217 }
218 }
219 }
220 }
221 #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
222 {
223 std::hint::black_box(data);
224 }
225 }
226
/// Execute a full memory fence.
///
/// Uses `mfence` on x86_64, `dmb sy` on aarch64, and a sequentially
/// consistent `std::sync::atomic::fence` on every other architecture.
#[inline(always)]
pub fn memory_fence() {
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        std::sync::atomic::fence(std::sync::atomic::Ordering::SeqCst);
    }

    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: `dmb sy` takes no operands and only orders memory accesses.
        unsafe {
            std::arch::asm!("dmb sy", options(nostack));
        }
    }

    #[cfg(target_arch = "x86_64")]
    {
        // SAFETY: `mfence` takes no operands and only orders memory accesses.
        unsafe {
            std::arch::asm!("mfence", options(nostack));
        }
    }
}
247
/// Flush the cache line that contains the first byte of `data`.
///
/// * x86_64: `clflush` on the address, then `mfence` so the flush is ordered
///   with surrounding memory accesses.
/// * aarch64: `dc civac` (clean and invalidate by virtual address).
/// * other architectures: nothing is flushed; the reference is only passed
///   through `std::hint::black_box`.
///
/// Only one line is flushed; a `T` that spans several cache lines is not
/// flushed completely.
///
/// Zero-sized `T` is ignored. A reference to a zero-sized value need not
/// point at mapped memory, and flush instructions (unlike prefetches) may
/// fault on an unmapped address.
///
/// Fixes relative to the previous version: the x86_64 branch issued only an
/// `mfence` (no flush), and that `mfence` was declared `nomem`, which tells
/// the compiler the asm touches no memory and so lets it reorder memory
/// accesses across the fence.
#[inline(always)]
pub fn flush_cache_line<T>(data: &T) {
    if std::mem::size_of::<T>() == 0 {
        return;
    }

    let ptr = data as *const T as *const u8;

    #[cfg(target_arch = "x86_64")]
    {
        // SAFETY: `ptr` points at a live, readable, non-zero-sized object.
        // `clflush` is part of the SSE2 baseline of x86_64 and is checked
        // like a byte load of that address.
        unsafe {
            std::arch::x86_64::_mm_clflush(ptr);
            // No `nomem`: the fence must stay ordered with memory accesses.
            std::arch::asm!("mfence", options(nostack));
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: `ptr` points at a live, non-zero-sized object.
        // NOTE(review): assumes the OS permits `dc civac` from user mode.
        unsafe {
            std::arch::asm!(
                "dc civac, {}",
                in(reg) ptr,
                options(nostack)
            );
        }
    }
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        // No flush instruction is used here; just prevent optimization.
        let _ = ptr;
        std::hint::black_box(data);
    }
}
280
/// Copy `src` into `dst`, with a separate path for copies above 64 KiB.
///
/// Copies of at most 64 KiB, and all copies on non-x86_64 targets, use
/// `copy_from_slice`. Larger copies on x86_64 perform a plain byte copy
/// followed by an `sfence`. Note that the copy itself is an ordinary
/// `copy_nonoverlapping`; no non-temporal store instructions are issued.
///
/// # Panics
///
/// Panics if `src` and `dst` have different lengths.
#[inline]
pub fn cache_aware_copy<T: Copy>(src: &[T], dst: &mut [T]) {
    assert_eq!(src.len(), dst.len());

    let total_bytes = std::mem::size_of_val(src);
    if total_bytes <= 64 * 1024 {
        dst.copy_from_slice(src);
        return;
    }

    #[cfg(target_arch = "x86_64")]
    {
        // SAFETY: both slices are valid for `total_bytes` bytes (their
        // lengths were asserted equal), and an exclusive `dst` borrow cannot
        // overlap the shared `src` borrow.
        unsafe {
            std::ptr::copy_nonoverlapping(
                src.as_ptr().cast::<u8>(),
                dst.as_mut_ptr().cast::<u8>(),
                total_bytes,
            );
            std::arch::asm!("sfence", options(nostack));
        }
    }

    #[cfg(not(target_arch = "x86_64"))]
    {
        dst.copy_from_slice(src);
    }
}
308
/// Set every element of `dst` to `value`.
///
/// With the `simd` feature on x86_64, slices larger than 32 KiB whose
/// element size is 8 bytes are written two elements per iteration, followed
/// by the leftover element if the length is odd. Every other case uses
/// `slice::fill`. The resulting contents are the same on both paths.
#[inline]
pub fn cache_aware_memset<T: Copy>(dst: &mut [T], value: T) {
    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
    {
        let is_large = std::mem::size_of_val(dst) > 32 * 1024;
        if is_large && std::mem::size_of::<T>() == 8 {
            let mut pairs = dst.chunks_exact_mut(2);
            for pair in &mut pairs {
                pair[0] = value;
                pair[1] = value;
            }
            // At most one element remains when the length is odd.
            for slot in pairs.into_remainder() {
                *slot = value;
            }
            return;
        }
    }

    dst.fill(value);
}
336}
337
338/// Performance metrics for adaptive learning
339#[allow(dead_code)]
340#[derive(Debug, Clone)]
341pub struct PerformanceMetrics {
342 /// Average execution times for different operation types
343 pub operation_times: std::collections::HashMap<String, f64>,
344 /// Success rate for different optimization strategies
345 pub strategy_success_rates: std::collections::HashMap<OptimizationStrategy, f64>,
346 /// Memory bandwidth utilization
347 pub memorybandwidth_utilization: f64,
348 /// Cache hit rates
349 pub cache_hit_rate: f64,
350 /// Parallel efficiency measurements
351 pub parallel_efficiency: f64,
352}
353
354impl Default for PerformanceMetrics {
355 fn default() -> Self {
356 Self {
357 operation_times: std::collections::HashMap::new(),
358 strategy_success_rates: std::collections::HashMap::new(),
359 memorybandwidth_utilization: 0.0,
360 cache_hit_rate: 0.0,
361 parallel_efficiency: 0.0,
362 }
363 }
364}
365
/// Execution strategies the optimizer can recommend.
///
/// The type is `Copy` and hashable so it can be used directly as a map key.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
#[allow(dead_code)]
pub enum OptimizationStrategy {
    Scalar,
    Simd,
    Parallel,
    Gpu,
    Hybrid,
    CacheOptimized,
    MemoryBound,
    ComputeBound,
    /// Optimizations specific to modern architectures (Zen4, Golden Cove,
    /// Apple Silicon).
    ModernArchOptimized,
    /// Tuned for advanced SIMD (AVX-512, NEON).
    VectorOptimized,
    /// Energy-efficient choice for mobile/edge devices.
    EnergyEfficient,
    /// High-throughput choice for server workloads.
    HighThroughput,
}
387
388/// Strategy selector for choosing the best optimization approach
389#[allow(dead_code)]
390#[derive(Debug, Clone)]
391pub struct StrategySelector {
392 /// Current preferred strategy
393 #[allow(dead_code)]
394 preferred_strategy: OptimizationStrategy,
395 /// Strategy weights based on past performance
396 strategy_weights: std::collections::HashMap<OptimizationStrategy, f64>,
397 /// Learning rate for weight updates
398 learningrate: f64,
399 /// Exploration rate for trying different strategies
400 exploration_rate: f64,
401}
402
403impl Default for StrategySelector {
404 fn default() -> Self {
405 let mut strategy_weights = std::collections::HashMap::new();
406 strategy_weights.insert(OptimizationStrategy::Scalar, 1.0);
407 strategy_weights.insert(OptimizationStrategy::Simd, 1.0);
408 strategy_weights.insert(OptimizationStrategy::Parallel, 1.0);
409 strategy_weights.insert(OptimizationStrategy::Gpu, 1.0);
410 strategy_weights.insert(OptimizationStrategy::Hybrid, 1.0);
411 strategy_weights.insert(OptimizationStrategy::CacheOptimized, 1.0);
412 strategy_weights.insert(OptimizationStrategy::MemoryBound, 1.0);
413 strategy_weights.insert(OptimizationStrategy::ComputeBound, 1.0);
414 strategy_weights.insert(OptimizationStrategy::ModernArchOptimized, 1.5); // Higher initial weight
415 strategy_weights.insert(OptimizationStrategy::VectorOptimized, 1.3);
416 strategy_weights.insert(OptimizationStrategy::EnergyEfficient, 1.0);
417 strategy_weights.insert(OptimizationStrategy::HighThroughput, 1.2);
418
419 Self {
420 preferred_strategy: OptimizationStrategy::ModernArchOptimized,
421 strategy_weights,
422 learningrate: 0.1,
423 exploration_rate: 0.1,
424 }
425 }
426}
427
428impl StrategySelector {
429 /// Select the best strategy for given operation characteristics
430 pub fn select_strategy(
431 &self,
432 operation_size: usize,
433 is_memory_bound: bool,
434 ) -> OptimizationStrategy {
435 // Use epsilon-greedy exploration
436 use std::collections::hash_map::DefaultHasher;
437 use std::hash::{Hash, Hasher};
438
439 let mut hasher = DefaultHasher::new();
440 operation_size.hash(&mut hasher);
441 let rand_val = (hasher.finish() % 100) as f64 / 100.0;
442
443 if rand_val < self.exploration_rate {
444 // Explore: choose a random strategy including modern ones
445 let strategies = [
446 OptimizationStrategy::Scalar,
447 OptimizationStrategy::Simd,
448 OptimizationStrategy::Parallel,
449 OptimizationStrategy::Gpu,
450 OptimizationStrategy::ModernArchOptimized,
451 OptimizationStrategy::VectorOptimized,
452 OptimizationStrategy::EnergyEfficient,
453 OptimizationStrategy::HighThroughput,
454 ];
455 strategies[operation_size % strategies.len()]
456 } else {
457 // Exploit: choose the best strategy based on characteristics and architecture
458 if is_memory_bound {
459 // For memory-_bound operations, prioritize cache optimization
460 if is_apple_silicon() || is_neoverse_or_newer() {
461 OptimizationStrategy::ModernArchOptimized
462 } else {
463 OptimizationStrategy::MemoryBound
464 }
465 } else if operation_size > 1_000_000 {
466 // Very large operations - use high-throughput strategies
467 OptimizationStrategy::HighThroughput
468 } else if operation_size > 100_000 {
469 // Large operations - check for modern architectures
470 if is_zen4_or_newer() || is_intel_golden_cove_or_newer() {
471 OptimizationStrategy::VectorOptimized
472 } else {
473 OptimizationStrategy::Parallel
474 }
475 } else if operation_size > 1_000 {
476 // Medium operations - use modern SIMD if available
477 if is_zen4_or_newer() || is_apple_silicon() {
478 OptimizationStrategy::ModernArchOptimized
479 } else {
480 OptimizationStrategy::Simd
481 }
482 } else {
483 // Small operations - consider energy efficiency
484 if cfg!(target_os = "android") || cfg!(target_os = "ios") {
485 OptimizationStrategy::EnergyEfficient
486 } else {
487 OptimizationStrategy::Scalar
488 }
489 }
490 }
491 }
492
493 /// Update strategy weights based on performance feedback
494 pub fn update_weights(&mut self, strategy: OptimizationStrategy, performancescore: f64) {
495 if let Some(weight) = self.strategy_weights.get_mut(&strategy) {
496 *weight = *weight * (1.0 - self.learningrate) + performancescore * self.learningrate;
497 }
498 }
499
500 /// Detect if running on ARM Neoverse or newer server architectures
501 #[allow(dead_code)]
502 fn is_neoverse_or_newer() -> bool {
503 crate::performance_optimization::is_neoverse_or_newer()
504 }
505
506 /// Detect if running on AMD Zen4 or newer architectures
507 #[allow(dead_code)]
508 fn is_zen4_or_newer() -> bool {
509 crate::performance_optimization::is_zen4_or_newer()
510 }
511
512 /// Detect if running on Intel Golden Cove (12th gen) or newer
513 #[allow(dead_code)]
514 fn is_intel_golden_cove_or_newer() -> bool {
515 crate::performance_optimization::is_intel_golden_cove_or_newer()
516 }
517}
518
/// Heuristic check for AMD Zen4-or-newer class CPUs.
///
/// Returns `true` on x86_64 when both AVX-512F and AVX-512VL are detected at
/// runtime, and `false` on every other architecture. Vendor and model are
/// not inspected, so any CPU exposing both features is reported.
#[allow(dead_code)]
fn is_zen4_or_newer() -> bool {
    #[cfg(not(target_arch = "x86_64"))]
    let has_avx512 = false;

    #[cfg(target_arch = "x86_64")]
    let has_avx512 = std::arch::is_x86_feature_detected!("avx512f")
        && std::arch::is_x86_feature_detected!("avx512vl");

    has_avx512
}
532
/// Heuristic check for Intel Golden Cove-or-newer class CPUs.
///
/// Returns `true` on x86_64 when AVX2, FMA and BMI2 are all detected at
/// runtime, and `false` on every other architecture. Vendor and model are
/// not inspected, so any CPU exposing these features is reported.
#[allow(dead_code)]
fn is_intel_golden_cove_or_newer() -> bool {
    #[cfg(not(target_arch = "x86_64"))]
    let has_features = false;

    #[cfg(target_arch = "x86_64")]
    let has_features = std::arch::is_x86_feature_detected!("avx2")
        && std::arch::is_x86_feature_detected!("fma")
        && std::arch::is_x86_feature_detected!("bmi2");

    has_features
}
548
/// Compile-time check for Apple Silicon targets.
///
/// `true` exactly when the crate is built for `aarch64` with
/// `target_vendor = "apple"`; nothing is probed at runtime.
#[allow(dead_code)]
fn is_apple_silicon() -> bool {
    cfg!(all(target_arch = "aarch64", target_vendor = "apple"))
}
562
/// Heuristic check for ARM Neoverse-or-newer class CPUs.
///
/// Returns `true` on aarch64 when the `asimd`, `crc` and `fp` features are
/// all detected at runtime, and `false` on every other architecture. The
/// CPU part number is not inspected.
#[allow(dead_code)]
fn is_neoverse_or_newer() -> bool {
    #[cfg(not(target_arch = "aarch64"))]
    let has_features = false;

    #[cfg(target_arch = "aarch64")]
    let has_features = std::arch::is_aarch64_feature_detected!("asimd")
        && std::arch::is_aarch64_feature_detected!("crc")
        && std::arch::is_aarch64_feature_detected!("fp");

    has_features
}
578
579/// Adaptive optimization based on runtime characteristics
580pub struct AdaptiveOptimizer {
581 /// Threshold for switching to parallel execution
582 parallel_threshold: AtomicUsize,
583 /// Threshold for using SIMD operations
584 simd_threshold: AtomicUsize,
585 /// Threshold for using GPU acceleration
586 #[allow(dead_code)]
587 gpu_threshold: AtomicUsize,
588 /// Cache line size for the current architecture
589 cache_line_size: usize,
590 /// Performance metrics for adaptive learning
591 performance_metrics: std::sync::RwLock<PerformanceMetrics>,
592 /// Optimization strategy selector
593 strategy_selector: std::sync::RwLock<StrategySelector>,
594}
595
596impl AdaptiveOptimizer {
597 /// Create a new adaptive optimizer
598 pub fn new() -> Self {
599 Self {
600 parallel_threshold: AtomicUsize::new(10_000),
601 simd_threshold: AtomicUsize::new(1_000),
602 gpu_threshold: AtomicUsize::new(100_000),
603 cache_line_size: Self::detect_cache_line_size(),
604 performance_metrics: std::sync::RwLock::new(PerformanceMetrics::default()),
605 strategy_selector: std::sync::RwLock::new(StrategySelector::default()),
606 }
607 }
608
/// Cache line size in bytes assumed for the compilation target.
///
/// This is a compile-time constant per architecture, not a runtime probe:
/// 128 for aarch64 and 64 for every other target (x86_64 and riscv64
/// included).
fn detect_cache_line_size() -> usize {
    if cfg!(target_arch = "aarch64") {
        128
    } else {
        64
    }
}
634
635 /// Check if parallel execution should be used for given size
636 #[inline]
637 #[allow(unused_variables)]
638 pub fn should_use_parallel(&self, size: usize) -> bool {
639 #[cfg(feature = "parallel")]
640 {
641 size >= self.parallel_threshold.load(Ordering::Relaxed)
642 }
643 #[cfg(not(feature = "parallel"))]
644 {
645 false
646 }
647 }
648
649 /// Check if SIMD should be used for given size
650 #[inline]
651 #[allow(unused_variables)]
652 pub fn should_use_simd(&self, size: usize) -> bool {
653 #[cfg(feature = "simd")]
654 {
655 size >= self.simd_threshold.load(Ordering::Relaxed)
656 }
657 #[cfg(not(feature = "simd"))]
658 {
659 false
660 }
661 }
662
663 /// Update thresholds based on performance measurements
664 pub fn update_from_measurement(&mut self, operation: &str, size: usize, durationns: u64) {
665 // Simple heuristic: adjust thresholds based on operation efficiency
666 let ops_per_ns = size as f64 / durationns as f64;
667
668 if operation.contains("parallel") && ops_per_ns < 0.1 {
669 // Parallel overhead too high, increase threshold
670 self.parallel_threshold
671 .fetch_add(size / 10, Ordering::Relaxed);
672 } else if operation.contains("simd") && ops_per_ns < 1.0 {
673 // SIMD not efficient enough, increase threshold
674 self.simd_threshold.fetch_add(size / 10, Ordering::Relaxed);
675 }
676 }
677
678 /// Get optimal chunk size for cache-friendly operations
679 #[inline]
680 pub fn optimal_chunk_size<T>(&self) -> usize {
681 // Calculate chunk size based on cache line size and element size
682 let element_size = std::mem::size_of::<T>();
683 let elements_per_cache_line = self.cache_line_size / element_size.max(1);
684
685 // Use multiple cache lines for better performance
686 elements_per_cache_line * 16
687 }
688
689 /// Check if GPU acceleration should be used for given size
690 #[inline]
691 #[allow(unused_variables)]
692 pub fn should_use_gpu(&self, size: usize) -> bool {
693 #[cfg(feature = "gpu")]
694 {
695 size >= self.gpu_threshold.load(Ordering::Relaxed)
696 }
697 #[cfg(not(feature = "gpu"))]
698 {
699 false
700 }
701 }
702
703 /// Select the optimal strategy for a given operation
704 pub fn select_for_operation(&self, operationname: &str, size: usize) -> OptimizationStrategy {
705 // Determine if operation is memory-bound based on operation name
706 let memory_bound = operationname.contains("copy")
707 || operationname.contains("memset")
708 || operationname.contains("transpose");
709
710 if let Ok(selector) = self.strategy_selector.read() {
711 selector.select_strategy(size, memory_bound)
712 } else {
713 // Fallback selection
714 if self.should_use_gpu(size) {
715 OptimizationStrategy::Gpu
716 } else if self.should_use_parallel(size) {
717 OptimizationStrategy::Parallel
718 } else if self.should_use_simd(size) {
719 OptimizationStrategy::Simd
720 } else {
721 OptimizationStrategy::Scalar
722 }
723 }
724 }
725
726 /// Record performance measurement and update adaptive parameters
727 pub fn record_performance(
728 &mut self,
729 operation: &str,
730 size: usize,
731 strategy: OptimizationStrategy,
732 duration_ns: u64,
733 ) {
734 // Calculate performance score (higher is better)
735 let ops_per_ns = size as f64 / duration_ns as f64;
736 let performance_score = ops_per_ns.min(10.0) / 10.0; // Normalize to 0.saturating_sub(1)
737
738 // Update strategy weights
739 if let Ok(mut selector) = self.strategy_selector.write() {
740 selector.update_weights(strategy, performance_score);
741 }
742
743 // Update performance metrics
744 if let Ok(mut metrics) = self.performance_metrics.write() {
745 let avg_time = metrics
746 .operation_times
747 .entry(operation.to_string())
748 .or_insert(0.0);
749 *avg_time = (*avg_time * 0.9) + (duration_ns as f64 * 0.1); // Exponential moving average
750
751 metrics
752 .strategy_success_rates
753 .insert(strategy, performance_score);
754 }
755
756 // Implement adaptive threshold updates based on performance
757 self.update_thresholds(operation, size, duration_ns);
758 }
759
760 /// Get performance metrics for analysis
761 pub fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
762 self.performance_metrics.read().ok().map(|m| m.clone())
763 }
764
765 /// Analyze operation characteristics to suggest optimizations
766 pub fn analyze_operation(&self, operation_name: &str, inputsize: usize) -> OptimizationAdvice {
767 let strategy = self.select_optimal_strategy(operation_name, inputsize);
768 let chunk_size = if strategy == OptimizationStrategy::Parallel {
769 Some(self.optimal_chunk_size::<f64>())
770 } else {
771 None
772 };
773
774 let prefetch_distance = if inputsize > 10_000 {
775 Some(self.cache_line_size * 8) // Prefetch 8 cache lines ahead
776 } else {
777 None
778 };
779
780 OptimizationAdvice {
781 recommended_strategy: strategy,
782 optimal_chunk_size: chunk_size,
783 prefetch_distance,
784 memory_allocation_hint: if inputsize > 1_000_000 {
785 Some("Consider using memory-mapped files for large outputs".to_string())
786 } else {
787 None
788 },
789 }
790 }
791
792 /// Detect if running on AMD Zen4 or newer architectures
793 #[allow(dead_code)]
794 fn is_zen4_or_newer() -> bool {
795 crate::performance_optimization::is_zen4_or_newer()
796 }
797
798 /// Detect if running on Intel Golden Cove (12th gen) or newer
799 #[allow(dead_code)]
800 fn is_intel_golden_cove_or_newer() -> bool {
801 crate::performance_optimization::is_intel_golden_cove_or_newer()
802 }
803
804 /// Select optimal strategy based on operation name and input size
805 pub fn select_optimal_strategy(
806 &self,
807 _operation_name: &str,
808 input_size: usize,
809 ) -> OptimizationStrategy {
810 // Check GPU threshold first (if available)
811 if input_size >= self.gpu_threshold.load(Ordering::Relaxed) && self.has_gpu_support() {
812 return OptimizationStrategy::Gpu;
813 }
814
815 // Check parallel threshold
816 if input_size >= self.parallel_threshold.load(Ordering::Relaxed) {
817 return OptimizationStrategy::Parallel;
818 }
819
820 // Check SIMD threshold
821 if input_size >= self.simd_threshold.load(Ordering::Relaxed) && self.has_simd_support() {
822 return OptimizationStrategy::Simd;
823 }
824
825 // Default to scalar
826 OptimizationStrategy::Scalar
827 }
828
829 /// Check if GPU support is available
830 pub fn has_gpu_support(&self) -> bool {
831 // For now, return false since GPU support is not implemented
832 false
833 }
834
835 /// Check if SIMD support is available
836 pub fn has_simd_support(&self) -> bool {
837 // Check if SIMD instructions are available on this platform
838 #[cfg(target_arch = "x86_64")]
839 {
840 // Bind each detection to a local first: clippy's `nonminimal_bool`
841 // mis-analyzes the `||` of two `is_x86_feature_detected!` expansions.
842 let has_avx2 = std::arch::is_x86_feature_detected!("avx2");
843 let has_sse41 = std::arch::is_x86_feature_detected!("sse4.1");
844 has_avx2 || has_sse41
845 }
846 #[cfg(target_arch = "aarch64")]
847 {
848 std::arch::is_aarch64_feature_detected!("neon")
849 }
850 #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
851 {
852 false
853 }
854 }
855
856 /// Update thresholds adaptively based on performance measurements
857 fn update_thresholds(&self, operation: &str, size: usize, duration_ns: u64) {
858 // Calculate operation efficiency (operations per nanosecond)
859 let ops_per_ns = size as f64 / duration_ns as f64;
860
861 // Get current strategy
862 let current_strategy = self.select_optimal_strategy(operation, size);
863
864 // Define efficiency targets for each strategy
865 const PARALLEL_MIN_EFFICIENCY: f64 = 0.5; // Minimum ops/ns for parallel to be worthwhile
866 const SIMD_MIN_EFFICIENCY: f64 = 2.0; // Minimum ops/ns for SIMD to be worthwhile
867 const GPU_MIN_EFFICIENCY: f64 = 10.0; // Minimum ops/ns for GPU to be worthwhile
868
869 match current_strategy {
870 OptimizationStrategy::Parallel => {
871 if ops_per_ns < PARALLEL_MIN_EFFICIENCY {
872 // Parallel overhead is too high, increase threshold
873 let new_threshold = (size as f64 * 1.2) as usize;
874 self.parallel_threshold
875 .store(new_threshold, Ordering::Relaxed);
876 } else if ops_per_ns > PARALLEL_MIN_EFFICIENCY * 2.0 {
877 // Parallel is very efficient, could lower threshold
878 let current = self.parallel_threshold.load(Ordering::Relaxed);
879 let new_threshold = (current as f64 * 0.9).max(1000.0) as usize;
880 self.parallel_threshold
881 .store(new_threshold, Ordering::Relaxed);
882 }
883 }
884 OptimizationStrategy::Simd => {
885 if ops_per_ns < SIMD_MIN_EFFICIENCY {
886 // SIMD not efficient enough, increase threshold
887 let new_threshold = (size as f64 * 1.1) as usize;
888 self.simd_threshold.store(new_threshold, Ordering::Relaxed);
889 } else if ops_per_ns > SIMD_MIN_EFFICIENCY * 2.0 {
890 // SIMD is very efficient, could lower threshold
891 let current = self.simd_threshold.load(Ordering::Relaxed);
892 let new_threshold = (current as f64 * 0.95).max(100.0) as usize;
893 self.simd_threshold.store(new_threshold, Ordering::Relaxed);
894 }
895 }
896 OptimizationStrategy::Gpu => {
897 if ops_per_ns < GPU_MIN_EFFICIENCY {
898 // GPU overhead is too high, increase threshold
899 let new_threshold = (size as f64 * 1.5) as usize;
900 self.gpu_threshold.store(new_threshold, Ordering::Relaxed);
901 } else if ops_per_ns > GPU_MIN_EFFICIENCY * 2.0 {
902 // GPU is very efficient, could lower threshold
903 let current = self.gpu_threshold.load(Ordering::Relaxed);
904 let new_threshold = (current as f64 * 0.8).max(10000.0) as usize;
905 self.gpu_threshold.store(new_threshold, Ordering::Relaxed);
906 }
907 }
908 _ => {
909 // For scalar operations, check if we should enable optimizations
910 if size > 1000 && ops_per_ns > SIMD_MIN_EFFICIENCY {
911 // Could benefit from SIMD
912 let current = self.simd_threshold.load(Ordering::Relaxed);
913 let new_threshold = size.min(current);
914 self.simd_threshold.store(new_threshold, Ordering::Relaxed);
915 }
916 if size > 10000 && ops_per_ns > PARALLEL_MIN_EFFICIENCY {
917 // Could benefit from parallelization
918 let current = self.parallel_threshold.load(Ordering::Relaxed);
919 let new_threshold = size.min(current);
920 self.parallel_threshold
921 .store(new_threshold, Ordering::Relaxed);
922 }
923 }
924 }
925
926 // Update performance metrics with the new threshold values
927 if let Ok(mut metrics) = self.performance_metrics.write() {
928 // Store current threshold values in metrics for analysis
929 metrics.operation_times.insert(
930 format!("{}_threshold_parallel", operation),
931 self.parallel_threshold.load(Ordering::Relaxed) as f64,
932 );
933 metrics.operation_times.insert(
934 format!("{}_threshold_simd", operation),
935 self.simd_threshold.load(Ordering::Relaxed) as f64,
936 );
937 metrics.operation_times.insert(
938 format!("{}_threshold_gpu", operation),
939 self.gpu_threshold.load(Ordering::Relaxed) as f64,
940 );
941 }
942 }
943}
944
945/// Optimization advice generated by the adaptive optimizer
946#[allow(dead_code)]
947#[derive(Debug, Clone)]
948pub struct OptimizationAdvice {
949 /// Recommended optimization strategy
950 pub recommended_strategy: OptimizationStrategy,
951 /// Optimal chunk size for parallel processing
952 pub optimal_chunk_size: Option<usize>,
953 /// Prefetch distance for memory access
954 pub prefetch_distance: Option<usize>,
955 /// Memory allocation hints
956 pub memory_allocation_hint: Option<String>,
957}
958
959impl Default for AdaptiveOptimizer {
960 fn default() -> Self {
961 Self::new()
962 }
963}
964
965/// Fast path optimizations for common operations
966pub mod fast_paths {
967 use super::*;
968
/// Element-wise `result[i] = a[i] + b[i]` for `f64` slices.
///
/// Execution path:
/// 1. with the `simd` feature, when the optimizer's SIMD threshold is
///    reached: blocks of four elements via `SimdUnifiedOps::simd_add`, then
///    a scalar loop for the remaining tail;
/// 2. with the `parallel` feature, when the parallel threshold is reached:
///    chunked parallel addition;
/// 3. otherwise: a plain scalar loop over all elements.
///
/// # Errors
///
/// Returns `Err("Array lengths must match")` unless `a`, `b` and `result`
/// all have the same length.
///
/// # Fixed defects
///
/// The tail loop of the SIMD path, the per-chunk loop of the parallel path
/// and the tail loop of the scalar path all wrote `result[0] = a[0] + b[0]`
/// instead of using the loop index, so those elements were never computed.
/// The optimizer is now constructed only when a feature-gated path can use
/// it.
#[inline]
// `len` is only read by the feature-gated paths.
#[allow(unused_variables)]
pub fn add_f64_arrays(a: &[f64], b: &[f64], result: &mut [f64]) -> Result<(), &'static str> {
    if a.len() != b.len() || a.len() != result.len() {
        return Err("Array lengths must match");
    }

    let len = a.len();

    #[cfg(any(feature = "simd", feature = "parallel"))]
    let optimizer = AdaptiveOptimizer::new();

    #[cfg(feature = "simd")]
    if optimizer.should_use_simd(len) {
        // Use SIMD operations for f64 addition
        use crate::simd_ops::SimdUnifiedOps;
        use ::ndarray::ArrayView1;

        // Process 4 f64s at a time
        let simd_chunks = len / 4;

        for i in 0..simd_chunks {
            let start = i * 4;
            let end = start + 4;

            if end <= len {
                let a_view = ArrayView1::from(&a[start..end]);
                let b_view = ArrayView1::from(&b[start..end]);

                let simd_result = f64::simd_add(&a_view, &b_view);
                result[start..end]
                    .copy_from_slice(simd_result.as_slice().expect("Operation failed"));
            }
        }

        // Scalar tail: the up-to-three elements after the last full block.
        let tail_start = simd_chunks * 4;
        for ((out, &x), &y) in result[tail_start..]
            .iter_mut()
            .zip(&a[tail_start..])
            .zip(&b[tail_start..])
        {
            *out = x + y;
        }
        return Ok(());
    }

    #[cfg(feature = "parallel")]
    if optimizer.should_use_parallel(len) {
        use crate::parallel_ops::*;

        let chunk_size = optimizer.optimal_chunk_size::<f64>();
        result
            .par_chunks_mut(chunk_size)
            .zip(a.par_chunks(chunk_size))
            .zip(b.par_chunks(chunk_size))
            .for_each(|((r_chunk, a_chunk), b_chunk)| {
                for ((out, &x), &y) in r_chunk.iter_mut().zip(a_chunk).zip(b_chunk) {
                    *out = x + y;
                }
            });
        return Ok(());
    }

    // Scalar fallback. The zipped iterators need no bounds checks, which
    // leaves unrolling and vectorization to the compiler.
    for ((out, &x), &y) in result.iter_mut().zip(a).zip(b) {
        *out = x + y;
    }

    Ok(())
}
1047
1048 /// Optimized matrix multiplication kernel
1049 #[inline]
1050 pub fn matmul_kernel(
1051 a: &[f64],
1052 b: &[f64],
1053 c: &mut [f64],
1054 m: usize,
1055 k: usize,
1056 n: usize,
1057 ) -> Result<(), &'static str> {
1058 if a.len() != m * k || b.len() != k * n || c.len() != m * n {
1059 return Err("Invalid matrix dimensions");
1060 }
1061
1062 // Tile sizes for cache optimization
1063 const TILE_M: usize = 64;
1064 const TILE_N: usize = 64;
1065 const TILE_K: usize = 64;
1066
1067 // Clear result matrix
1068 c.fill(0.0);
1069
1070 #[cfg(feature = "parallel")]
1071 {
1072 let optimizer = AdaptiveOptimizer::new();
1073 if optimizer.should_use_parallel(m * n) {
1074 use crate::parallel_ops::*;
1075
1076 // Use synchronization for parallel matrix multiplication
1077 use std::sync::Mutex;
1078 let c_mutex = Mutex::new(c);
1079
1080 // Parallel tiled implementation using row-wise parallelization
1081 (0..m).into_par_iter().step_by(TILE_M).for_each(|i0| {
1082 let i_max = (i0 + TILE_M).min(m);
1083 let mut local_updates = Vec::new();
1084
1085 for j0 in (0..n).step_by(TILE_N) {
1086 for k0 in (0..k).step_by(TILE_K) {
1087 let j_max = (j0 + TILE_N).min(n);
1088 let k_max = (k0 + TILE_K).min(k);
1089
1090 for i in i0..i_max {
1091 for j in j0..j_max {
1092 let mut sum = 0.0;
1093 for k_idx in k0..k_max {
1094 sum += a[i * k + k_idx] * b[k_idx * n + j];
1095 }
1096 local_updates.push((i, j, sum));
1097 }
1098 }
1099 }
1100 }
1101
1102 // Apply all local updates at once
1103 if let Ok(mut c_guard) = c_mutex.lock() {
1104 for (i, j, sum) in local_updates {
1105 c_guard[i * n + j] += sum;
1106 }
1107 }
1108 });
1109 return Ok(());
1110 }
1111 }
1112
1113 // Serial tiled implementation
1114 for i0 in (0..m).step_by(TILE_M) {
1115 for j0 in (0..n).step_by(TILE_N) {
1116 for k0 in (0..k).step_by(TILE_K) {
1117 let i_max = (i0 + TILE_M).min(m);
1118 let j_max = (j0 + TILE_N).min(n);
1119 let k_max = (k0 + TILE_K).min(k);
1120
1121 for i in i0..i_max {
1122 for j in j0..j_max {
1123 let mut sum = c[i * n + j];
1124 for k_idx in k0..k_max {
1125 sum += a[i * k + k_idx] * b[k_idx * n + j];
1126 }
1127 c[i * n + j] = sum;
1128 }
1129 }
1130 }
1131 }
1132 }
1133
1134 Ok(())
1135 }
1136}
1137
/// Memory access pattern optimizer.
///
/// Classifies sequences of element addresses (see `analyze_access_pattern`)
/// so callers can choose an access strategy that matches the pattern.
#[allow(dead_code)]
#[derive(Debug)]
pub struct MemoryAccessOptimizer {
    /// Stride detection state for array access.
    stride_detector: StrideDetector,
}

/// Bookkeeping for stride detection.
///
/// NOTE(review): none of these fields is read by the code visible in this
/// part of the file; the descriptions below are inferred from the field
/// names and should be confirmed against any other users.
#[derive(Debug, Default)]
#[allow(dead_code)]
struct StrideDetector {
    /// Presumably the most recently observed address, if any.
    last_address: Option<usize>,
    /// Presumably the stride detected so far, if any.
    detected_stride: Option<isize>,
    /// Presumably how much the detected stride is trusted.
    confidence: f32,
}
1152
1153impl MemoryAccessOptimizer {
1154 pub fn new() -> Self {
1155 Self {
1156 stride_detector: StrideDetector::default(),
1157 }
1158 }
1159
1160 /// Analyze memory access pattern and suggest optimizations
1161 pub fn analyze_access_pattern<T>(&mut self, addresses: &[*const T]) -> AccessPattern {
1162 if addresses.is_empty() {
1163 return AccessPattern::Unknown;
1164 }
1165
1166 // Simple stride detection
1167 let mut strides = Vec::new();
1168 for window in addresses.windows(2) {
1169 let stride = (window[1] as isize) - (window[0] as isize);
1170 strides.push(stride / std::mem::size_of::<T>() as isize);
1171 }
1172
1173 // Check if all strides are equal (sequential access)
1174 if strides.windows(2).all(|w| w[0] == w[1]) {
1175 match strides[0] {
1176 1 => AccessPattern::Sequential,
1177 -1 => AccessPattern::ReverseSequential,
1178 s if s > 1 => AccessPattern::Strided(s as usize),
1179 _ => AccessPattern::Random,
1180 }
1181 } else {
1182 AccessPattern::Random
1183 }
1184 }
1185}
1186
/// Classification of a sequence of memory accesses, measured in elements.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AccessPattern {
    /// Every access is exactly one element after the previous one.
    Sequential,
    /// Every access is exactly one element before the previous one.
    ReverseSequential,
    /// Every access is the same number of elements (greater than one) after
    /// the previous one; the payload is that element stride.
    Strided(usize),
    /// The accesses follow no single forward stride (the strides differ, or
    /// the common stride is zero or a backward step larger than one element).
    Random,
    /// No classification could be made (for example, no addresses were supplied).
    Unknown,
}
1196
1197impl Default for MemoryAccessOptimizer {
1198 fn default() -> Self {
1199 Self::new()
1200 }
1201}
1202
1203/// Re-export the benchmarking framework for performance analysis
1204pub use crate::performance::benchmarking;
1205
/// Cache-aware algorithms tuned for maximum performance.
///
/// Re-export of the cache-aware algorithms module, which provides adaptive
/// algorithms that automatically adjust their behavior based on cache
/// performance characteristics and system topology.
1211pub use crate::performance::cache_optimization as cache_aware_algorithms;
1212
1213/// Re-export the advanced AI-driven optimization module
1214pub use crate::performance::advanced_optimization;
1215
1216/* Tests removed due to compilation issues with --all-features
1217#[cfg(test)]
1218mod tests {
1219 use super::*;
1220 use std::time::Duration;
1221
1222 #[cfg(feature = "benchmarking")]
1223 use crate::benchmarking;
1224
1225 #[test]
1226 fn test_adaptive_optimizer() {
1227 let optimizer = AdaptiveOptimizer::new();
1228
1229 // Test threshold detection
1230 assert!(!optimizer.should_use_parallel(100));
1231
1232 // Only test parallel execution if the feature is enabled
1233 #[cfg(feature = "parallel")]
1234 assert!(optimizer.should_use_parallel(100_000));
1235
1236 // Test chunk size calculation
1237 let chunk_size = optimizer.optimal_chunk_size::<f64>();
1238 assert!(chunk_size > 0);
1239 assert_eq!(chunk_size % 16, 0); // Should be multiple of 16
1240 }
1241
1242 #[test]
1243 fn test_fast_path_addition() {
1244 let a = vec![1.0; 32];
1245 let b = vec![2.0; 32];
1246 let mut result = vec![0.0; 32];
1247
1248 fast_paths::add_f64_arrays(&a, &b, &mut result).expect("Operation failed");
1249
1250 for val in result {
1251 assert_eq!(val, 3.0);
1252 }
1253 }
1254
1255 #[test]
1256 fn test_memory_access_pattern() {
1257 let mut optimizer = MemoryAccessOptimizer::new();
1258
1259 // Sequential access
1260 let addresses: Vec<*const f64> = (0..10)
1261 .map(|i| (i * std::mem::size_of::<f64>()) as *const f64)
1262 .collect();
1263 assert_eq!(
1264 optimizer.analyze_access_pattern(&addresses),
1265 AccessPattern::Sequential
1266 );
1267
1268 // Strided access
1269 let addresses: Vec<*const f64> = (0..10)
1270 .map(|i| (i * 3 * std::mem::size_of::<f64>()) as *const f64)
1271 .collect();
1272 assert_eq!(
1273 optimizer.analyze_access_pattern(&addresses),
1274 AccessPattern::Strided(3)
1275 );
1276 }
1277
1278 #[test]
1279 fn test_performance_hints() {
1280 // Test that hints don't crash and return correct values
1281 assert!(PerformanceHints::likely(true));
1282 assert!(!PerformanceHints::likely(false));
1283 assert!(PerformanceHints::unlikely(true));
1284 assert!(!PerformanceHints::unlikely(false));
1285
1286 // Test prefetch operations (should not crash)
1287 let data = [1.0f64; 100];
1288 PerformanceHints::prefetch_read(&data[0]);
1289
1290 let mut data_mut = [0.0f64; 100];
1291 PerformanceHints::prefetch_write(&mut data_mut[0]);
1292
1293 // Test locality-based prefetch
1294 PerformanceHints::prefetch_with_locality(&data[0], Locality::High);
1295 PerformanceHints::prefetch_with_locality(&data[0], Locality::Medium);
1296 PerformanceHints::prefetch_with_locality(&data[0], Locality::Low);
1297 PerformanceHints::prefetch_with_locality(&data[0], Locality::None);
1298 }
1299
1300 #[test]
1301 fn test_cache_operations() {
1302 let data = [1.0f64; 8];
1303
1304 // Test cache flush (should not crash)
1305 PerformanceHints::flush_cache_line(&data[0]);
1306
1307 // Test memory fence (should not crash)
1308 PerformanceHints::memory_fence();
1309
1310 // Test cache-aware copy
1311 let src = vec![1.0f64; 64];
1312 let mut dst = vec![0.0f64; 64];
1313 PerformanceHints::cache_aware_copy(&src, &mut dst);
1314 assert_eq!(src, dst);
1315
1316 // Test cache-aware memset
1317 let mut data = vec![0.0f64; 64];
1318 PerformanceHints::cache_aware_memset(&mut data, 5.0);
1319 assert!(data.iter().all(|&x| x == 5.0));
1320 }
1321
1322 #[test]
1323 fn test_locality_enum() {
1324 // Test that Locality enum works correctly
1325 let localities = [
1326 Locality::High,
1327 Locality::Medium,
1328 Locality::Low,
1329 Locality::None,
1330 ];
1331
1332 for locality in &localities {
1333 // Test that we can use locality in prefetch
1334 let data = 42i32;
1335 PerformanceHints::prefetch_with_locality(&data, *locality);
1336 }
1337
1338 // Test enum properties
1339 assert_eq!(Locality::High, Locality::High);
1340 assert_ne!(Locality::High, Locality::Low);
1341
1342 // Test Debug formatting
1343 assert!(format!("{:?}", Locality::High).contains("High"));
1344 }
1345
1346 #[test]
1347 fn test_strategy_selector() {
1348 let mut selector = StrategySelector::default();
1349
1350 // Test strategy selection
1351 let strategy = selector.select_strategy(1000, false);
1352 assert!(matches!(
1353 strategy,
1354 OptimizationStrategy::Simd
1355 | OptimizationStrategy::Scalar
1356 | OptimizationStrategy::Parallel
1357 | OptimizationStrategy::Gpu
1358 ));
1359
1360 // Test weight updates
1361 selector.update_weights(OptimizationStrategy::Simd, 0.8);
1362 selector.update_weights(OptimizationStrategy::Parallel, 0.9);
1363
1364 // Weights should be updated
1365 assert!(selector.strategy_weights[&OptimizationStrategy::Simd] != 1.0);
1366 assert!(selector.strategy_weights[&OptimizationStrategy::Parallel] != 1.0);
1367 }
1368
1369 #[test]
1370 fn test_adaptive_optimizer_enhanced() {
1371 let mut optimizer = AdaptiveOptimizer::new();
1372
1373 // Test GPU threshold
1374 assert!(!optimizer.should_use_gpu(1000));
1375
1376 // Test strategy selection
1377 let strategy = optimizer.select_optimal_strategy("matrix_multiply", 50_000);
1378 assert!(matches!(
1379 strategy,
1380 OptimizationStrategy::Parallel
1381 | OptimizationStrategy::Simd
1382 | OptimizationStrategy::Scalar
1383 | OptimizationStrategy::Gpu
1384 | OptimizationStrategy::Hybrid
1385 | OptimizationStrategy::CacheOptimized
1386 | OptimizationStrategy::MemoryBound
1387 | OptimizationStrategy::ComputeBound
1388 | OptimizationStrategy::ModernArchOptimized
1389 | OptimizationStrategy::VectorOptimized
1390 | OptimizationStrategy::EnergyEfficient
1391 | OptimizationStrategy::HighThroughput
1392 ));
1393
1394 // Test performance recording
1395 optimizer.record_performance("test_op", 1000, OptimizationStrategy::Simd, 1_000_000);
1396
1397 // Test optimization advice
1398 let advice = optimizer.analyze_operation("matrix_multiply", 10_000);
1399 assert!(matches!(
1400 advice.recommended_strategy,
1401 OptimizationStrategy::Parallel
1402 | OptimizationStrategy::Simd
1403 | OptimizationStrategy::Scalar
1404 | OptimizationStrategy::Gpu
1405 | OptimizationStrategy::Hybrid
1406 | OptimizationStrategy::CacheOptimized
1407 | OptimizationStrategy::MemoryBound
1408 | OptimizationStrategy::ComputeBound
1409 | OptimizationStrategy::ModernArchOptimized
1410 | OptimizationStrategy::VectorOptimized
1411 | OptimizationStrategy::EnergyEfficient
1412 | OptimizationStrategy::HighThroughput
1413 ));
1414
1415 // Test metrics retrieval
1416 let metrics = optimizer.get_performance_metrics();
1417 assert!(metrics.is_some());
1418 }
1419
1420 #[test]
1421 fn test_optimization_strategy_enum() {
1422 // Test that all strategies can be created and compared
1423 let strategies = [
1424 OptimizationStrategy::Scalar,
1425 OptimizationStrategy::Simd,
1426 OptimizationStrategy::Parallel,
1427 OptimizationStrategy::Gpu,
1428 OptimizationStrategy::Hybrid,
1429 OptimizationStrategy::CacheOptimized,
1430 OptimizationStrategy::MemoryBound,
1431 OptimizationStrategy::ComputeBound,
1432 ];
1433
1434 for strategy in &strategies {
1435 // Test Debug formatting
1436 assert!(!format!("{strategy:?}").is_empty());
1437
1438 // Test equality
1439 assert_eq!(*strategy, *strategy);
1440 }
1441 }
1442
1443 #[test]
1444 fn test_performance_metrics() {
1445 let mut metrics = PerformanceMetrics::default();
1446
1447 // Test that we can add operation times
1448 metrics
1449 .operation_times
1450 .insert("test_op".to_string(), 1000.0);
1451 assert_eq!(metrics.operation_times["test_op"], 1000.0);
1452
1453 // Test strategy success rates
1454 metrics
1455 .strategy_success_rates
1456 .insert(OptimizationStrategy::Simd, 0.85);
1457 assert_eq!(
1458 metrics.strategy_success_rates[&OptimizationStrategy::Simd],
1459 0.85
1460 );
1461
1462 // Test other metrics
1463 metrics.memorybandwidth_utilization = 0.75;
1464 metrics.cache_hit_rate = 0.90;
1465 metrics.parallel_efficiency = 0.80;
1466
1467 assert_eq!(metrics.memorybandwidth_utilization, 0.75);
1468 assert_eq!(metrics.cache_hit_rate, 0.90);
1469 assert_eq!(metrics.parallel_efficiency, 0.80);
1470 }
1471
1472 #[test]
1473 fn test_optimization_advice() {
1474 let advice = OptimizationAdvice {
1475 recommended_strategy: OptimizationStrategy::Parallel,
1476 optimal_chunk_size: Some(1024),
1477 prefetch_distance: Some(64),
1478 memory_allocation_hint: Some("Use memory mapping".to_string()),
1479 };
1480
1481 assert_eq!(advice.recommended_strategy, OptimizationStrategy::Parallel);
1482 assert_eq!(advice.optimal_chunk_size, Some(1024));
1483 assert_eq!(advice.prefetch_distance, Some(64));
1484 assert!(advice.memory_allocation_hint.is_some());
1485
1486 // Test Debug formatting
1487 assert!(!format!("{advice:?}").is_empty());
1488 }
1489
1490 #[test]
1491 fn test_benchmarking_config() {
1492 let config = benchmarking::BenchmarkConfig::default();
1493
1494 assert_eq!(config.warmup_iterations, 5);
1495 assert_eq!(config.measurement_iterations, 20);
1496 assert!(!config.sample_sizes.is_empty());
1497 assert!(!config.strategies.is_empty());
1498
1499 // Test preset configurations
1500 let array_config = benchmarking::presets::array_operations();
1501 assert_eq!(array_config.warmup_iterations, 3);
1502 assert_eq!(array_config.measurement_iterations, 10);
1503
1504 let matrix_config = benchmarking::presets::matrix_operations();
1505 assert_eq!(matrix_config.warmup_iterations, 5);
1506 assert_eq!(matrix_config.measurement_iterations, 15);
1507
1508 let memory_config = benchmarking::presets::memory_intensive();
1509 assert_eq!(memory_config.warmup_iterations, 2);
1510 assert_eq!(memory_config.measurement_iterations, 8);
1511 }
1512
1513 #[test]
1514 fn test_benchmark_measurement() {
1515 let measurement = benchmarking::BenchmarkMeasurement {
1516 duration: Duration::from_millis(5),
1517 strategy: OptimizationStrategy::Simd,
1518 input_size: 1000,
1519 throughput: 200_000.0,
1520 memory_usage: 8000,
1521 custom_metrics: std::collections::HashMap::new(),
1522 };
1523
1524 assert_eq!(measurement.strategy, OptimizationStrategy::Simd);
1525 assert_eq!(measurement.input_size, 1000);
1526 assert_eq!(measurement.throughput, 200_000.0);
1527 assert_eq!(measurement.memory_usage, 8000);
1528 }
1529
1530 #[test]
1531 fn test_benchmark_runner() {
1532 let config = benchmarking::BenchmarkConfig {
1533 warmup_iterations: 1,
1534 measurement_iterations: 2,
1535 min_duration: Duration::from_millis(1),
1536 max_duration: Duration::from_secs(1),
1537 sample_sizes: vec![10, 100],
1538 strategies: vec![OptimizationStrategy::Scalar, OptimizationStrategy::Simd]
1539 .into_iter()
1540 .collect(),
1541 };
1542
1543 let runner = benchmarking::BenchmarkRunner::new(config);
1544
1545 // Test a simple operation
1546 let results = runner.benchmark_operation("test_add", |data, _strategy| {
1547 let result: Vec<f64> = data.iter().map(|x| *x + 1.0).collect();
1548 (Duration::from_millis(1), result)
1549 });
1550
1551 assert!(!results.measurements.is_empty());
1552 }
1553
1554 #[test]
1555 fn test_strategy_performance() {
1556 let performance = benchmarking::StrategyPerformance {
1557 avg_throughput: 150_000.0,
1558 throughput_stddev: 5_000.0,
1559 avg_memory_usage: 8000.0,
1560 optimal_size: 10_000,
1561 efficiency_score: 0.85,
1562 };
1563
1564 assert_eq!(performance.avg_throughput, 150_000.0);
1565 assert_eq!(performance.throughput_stddev, 5_000.0);
1566 assert_eq!(performance.optimal_size, 10_000);
1567 assert_eq!(performance.efficiency_score, 0.85);
1568 }
1569
1570 #[test]
1571 fn test_scalability_analysis() {
1572 let mut parallel_efficiency = std::collections::HashMap::new();
1573 parallel_efficiency.insert(1000, 0.8);
1574 parallel_efficiency.insert(10000, 0.9);
1575
1576 let memory_scaling = benchmarking::MemoryScaling {
1577 linear_coefficient: 8.0,
1578 constant_coefficient: 1024.0,
1579 r_squared: 0.95,
1580 };
1581
1582 let bottleneck = benchmarking::PerformanceBottleneck {
1583 bottleneck_type: benchmarking::BottleneckType::MemoryBandwidth,
1584 size_range: (10000, 10000),
1585 impact: 0.3,
1586 mitigation: "Use memory prefetching".to_string(),
1587 };
1588
1589 let analysis = benchmarking::ScalabilityAnalysis {
1590 parallel_efficiency,
1591 memory_scaling,
1592 bottlenecks: vec![bottleneck],
1593 };
1594
1595 assert_eq!(analysis.parallel_efficiency[&1000], 0.8);
1596 assert_eq!(analysis.memory_scaling.linear_coefficient, 8.0);
1597 assert_eq!(analysis.bottlenecks.len(), 1);
1598 assert_eq!(
1599 analysis.bottlenecks[0].bottleneck_type,
1600 benchmarking::BottleneckType::MemoryBandwidth
1601 );
1602 }
1603
1604 #[test]
1605 fn test_memory_scaling() {
1606 let scaling = benchmarking::MemoryScaling {
1607 linear_coefficient: 8.0,
1608 constant_coefficient: 512.0,
1609 r_squared: 0.99,
1610 };
1611
1612 assert_eq!(scaling.linear_coefficient, 8.0);
1613 assert_eq!(scaling.constant_coefficient, 512.0);
1614 assert_eq!(scaling.r_squared, 0.99);
1615 }
1616
1617 #[test]
1618 fn test_performance_bottleneck() {
1619 let bottleneck = benchmarking::PerformanceBottleneck {
1620 bottleneck_type: benchmarking::BottleneckType::SynchronizationOverhead,
1621 size_range: (1000, 5000),
1622 impact: 0.6,
1623 mitigation: "Reduce thread contention".to_string(),
1624 };
1625
1626 assert_eq!(
1627 bottleneck.bottleneck_type,
1628 benchmarking::BottleneckType::SynchronizationOverhead
1629 );
1630 assert_eq!(bottleneck.size_range, (1000, 5000));
1631 assert_eq!(bottleneck.impact, 0.6);
1632 assert_eq!(bottleneck.mitigation, "Reduce thread contention");
1633 }
1634
1635 #[test]
1636 fn test_bottleneck_type_enum() {
1637 let bottleneck_types = [
1638 benchmarking::BottleneckType::MemoryBandwidth,
1639 benchmarking::BottleneckType::CacheLatency,
1640 benchmarking::BottleneckType::ComputeBound,
1641 benchmarking::BottleneckType::SynchronizationOverhead,
1642 benchmarking::BottleneckType::AlgorithmicComplexity,
1643 ];
1644
1645 for bottleneck_type in &bottleneck_types {
1646 // Test Debug formatting
1647 assert!(!format!("{bottleneck_type:?}").is_empty());
1648
1649 // Test equality
1650 assert_eq!(*bottleneck_type, *bottleneck_type);
1651 }
1652
1653 // Test inequality
1654 assert_ne!(
1655 benchmarking::BottleneckType::MemoryBandwidth,
1656 benchmarking::BottleneckType::CacheLatency
1657 );
1658 }
1659
1660 #[test]
1661 fn test_benchmark_results() {
1662 let measurement = benchmarking::BenchmarkMeasurement {
1663 strategy: OptimizationStrategy::Parallel,
1664 input_size: 1000,
1665 duration: Duration::from_millis(10),
1666 throughput: 100_000.0,
1667 memory_usage: 8000,
1668 custom_metrics: std::collections::HashMap::new(),
1669 };
1670
1671 let mut strategy_summary = std::collections::HashMap::new();
1672 strategy_summary.insert(
1673 OptimizationStrategy::Parallel,
1674 benchmarking::StrategyPerformance {
1675 avg_throughput: 100_000.0,
1676 throughput_stddev: 1_000.0,
1677 avg_memory_usage: 8000.0,
1678 optimal_size: 1000,
1679 efficiency_score: 0.9,
1680 },
1681 );
1682
1683 let scalability_analysis = benchmarking::ScalabilityAnalysis {
1684 parallel_efficiency: std::collections::HashMap::new(),
1685 memory_scaling: benchmarking::MemoryScaling {
1686 linear_coefficient: 8.0,
1687 constant_coefficient: 0.0,
1688 r_squared: 1.0,
1689 },
1690 bottlenecks: Vec::new(),
1691 };
1692
1693 let results = benchmarking::BenchmarkResults {
1694 operation_name: "test_operation".to_string(),
1695 measurements: vec![measurement],
1696 strategy_summary,
1697 scalability_analysis,
1698 recommendations: vec!["Use parallel strategy".to_string()],
1699 total_duration: Duration::from_millis(100),
1700 };
1701
1702 assert_eq!(results.operation_name, "test_operation");
1703 assert_eq!(results.measurements.len(), 1);
1704 assert_eq!(results.strategy_summary.len(), 1);
1705 assert_eq!(results.recommendations.len(), 1);
1706 assert_eq!(results.total_duration, Duration::from_millis(100));
1707 }
1708
1709 #[test]
1710 fn test_modern_architecture_detection() {
1711 // Test architecture detection functions (these will return results based on actual hardware)
1712 let zen4_detected = is_zen4_or_newer();
1713 let golden_cove_detected = is_intel_golden_cove_or_newer();
1714 let apple_silicon_detected = is_apple_silicon();
1715 let neoverse_detected = is_neoverse_or_newer();
1716
1717 // These tests will pass as they just check the functions don't panic
1718 // Test passes if no panic occurs above
1719 }
1720
1721 #[test]
1722 fn test_enhanced_strategy_selector() {
1723 let selector = StrategySelector::default();
1724
1725 // Test that new strategies are included in default weights
1726 assert!(selector
1727 .strategy_weights
1728 .contains_key(&OptimizationStrategy::ModernArchOptimized));
1729 assert!(selector
1730 .strategy_weights
1731 .contains_key(&OptimizationStrategy::VectorOptimized));
1732 assert!(selector
1733 .strategy_weights
1734 .contains_key(&OptimizationStrategy::EnergyEfficient));
1735 assert!(selector
1736 .strategy_weights
1737 .contains_key(&OptimizationStrategy::HighThroughput));
1738
1739 // Test that ModernArchOptimized has higher initial weight
1740 let modern_weight = selector
1741 .strategy_weights
1742 .get(&OptimizationStrategy::ModernArchOptimized)
1743 .expect("Operation failed");
1744 let scalar_weight = selector
1745 .strategy_weights
1746 .get(&OptimizationStrategy::Scalar)
1747 .expect("Operation failed");
1748 assert!(modern_weight > scalar_weight);
1749 }
1750
1751 #[test]
1752 fn test_enhanced_strategy_selection() {
1753 let selector = StrategySelector::default();
1754
1755 // Test small operation strategy selection
1756 let small_strategy = selector.select_strategy(100, false);
1757 assert!(matches!(
1758 small_strategy,
1759 OptimizationStrategy::Scalar
1760 | OptimizationStrategy::EnergyEfficient
1761 | OptimizationStrategy::ModernArchOptimized
1762 ));
1763
1764 // Test large operation strategy selection
1765 let large_strategy = selector.select_strategy(1_000_000, false);
1766 assert!(matches!(
1767 large_strategy,
1768 OptimizationStrategy::HighThroughput
1769 | OptimizationStrategy::VectorOptimized
1770 | OptimizationStrategy::Parallel
1771 ));
1772
1773 // Test memory-bound operation strategy selection
1774 let memory_bound_strategy = selector.select_strategy(10_000, true);
1775 assert!(matches!(
1776 memory_bound_strategy,
1777 OptimizationStrategy::MemoryBound | OptimizationStrategy::ModernArchOptimized
1778 ));
1779 }
1780
1781 #[test]
1782 #[cfg(feature = "benchmarking")]
1783 fn test_advanced_benchmark_config() {
1784 let config = benchmarking::presets::advanced_comprehensive();
1785
1786 // Verify comprehensive strategy coverage
1787 assert!(config
1788 .strategies
1789 .contains(&OptimizationStrategy::ModernArchOptimized));
1790 assert!(config
1791 .strategies
1792 .contains(&OptimizationStrategy::VectorOptimized));
1793 assert!(config
1794 .strategies
1795 .contains(&OptimizationStrategy::EnergyEfficient));
1796 assert!(config
1797 .strategies
1798 .contains(&OptimizationStrategy::HighThroughput));
1799
1800 // Verify comprehensive size coverage
1801 assert!(config.sample_sizes.len() >= 10);
1802 assert!(config.sample_sizes.contains(&100));
1803 assert!(config.sample_sizes.contains(&5_000_000));
1804
1805 // Verify thorough measurement configuration
1806 assert!(config.measurement_iterations >= 25);
1807 assert!(config.warmup_iterations >= 10);
1808 }
1809
1810 #[test]
1811 #[cfg(feature = "benchmarking")]
1812 fn test_modern_architecture_benchmark_config() {
1813 let config = benchmarking::presets::modern_architectures();
1814
1815 // Verify focus on modern strategies
1816 assert_eq!(config.strategies.len(), 4);
1817 assert!(config
1818 .strategies
1819 .contains(&OptimizationStrategy::ModernArchOptimized));
1820 assert!(config
1821 .strategies
1822 .contains(&OptimizationStrategy::VectorOptimized));
1823 assert!(config
1824 .strategies
1825 .contains(&OptimizationStrategy::HighThroughput));
1826 assert!(config
1827 .strategies
1828 .contains(&OptimizationStrategy::EnergyEfficient));
1829
1830 // Should not contain basic strategies for focused testing
1831 assert!(!config.strategies.contains(&OptimizationStrategy::Scalar));
1832 }
1833
1834 #[test]
1835 fn test_enhanced_cache_line_detection() {
1836 let optimizer = AdaptiveOptimizer::new();
1837 let cache_line_size = optimizer.cache_line_size;
1838
1839 // Cache line size should be reasonable (typically 64 or 128 bytes)
1840 assert!(cache_line_size == 64 || cache_line_size == 128);
1841
1842 // Should be power of 2
1843 assert_eq!(cache_line_size & (cache_line_size - 1), 0);
1844 }
1845
1846 #[test]
1847 fn test_strategy_weight_updates() {
1848 let mut selector = StrategySelector::default();
1849 let initial_weight = *selector
1850 .strategy_weights
1851 .get(&OptimizationStrategy::ModernArchOptimized)
1852 .expect("Operation failed");
1853
1854 // Update with good performance score
1855 selector.update_weights(OptimizationStrategy::ModernArchOptimized, 0.9);
1856 let updated_weight = *selector
1857 .strategy_weights
1858 .get(&OptimizationStrategy::ModernArchOptimized)
1859 .expect("Operation failed");
1860
1861 // Weight should have been adjusted based on learning
1862 assert_ne!(initial_weight, updated_weight);
1863 }
1864}
1865*/