use std::sync::atomic::{AtomicUsize, Ordering};

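/// Temporal-locality hint for software prefetching.
///
/// On x86_64 the variants map to `prefetcht0`/`prefetcht1`/`prefetcht2`/
/// `prefetchnta`; on AArch64 they map to `prfm pldl1keep`/`pldl2keep`/
/// `pldl3keep`/`pldl1strm`.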
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Locality {
    High,
    Medium,
    Low,
    None,
}

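/// Thin wrappers around architecture-specific hint instructions
/// (branch-prediction markers, prefetches, fences, cache-line flushes)
/// with portable fallbacks on other targets.
///
/// A minimal usage sketch (the crate path here is assumed, not prescribed):
///
/// ```ignore
/// use your_crate::performance_optimization::PerformanceHints; // path assumed
///
/// let table = [0u64; 1024];
/// // Hint that an element will be read soon and keep it in all cache levels.
/// PerformanceHints::prefetch_read(&table[512]);
/// if PerformanceHints::likely(table[0] == 0) {
///     // hot path
/// }
/// ```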
pub struct PerformanceHints;

impl PerformanceHints {
    #[inline(always)]
    pub fn likely(cond: bool) -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            if cond {
                unsafe {
                    std::arch::asm!("# likely branch", options(nomem, nostack));
                }
            }
        }
        cond
    }

    #[inline(always)]
    pub fn unlikely(cond: bool) -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            if !cond {
                unsafe {
                    std::arch::asm!("# unlikely branch", options(nomem, nostack));
                }
            }
        }
        cond
    }

    #[inline(always)]
    pub fn prefetch_read<T>(data: &T) {
        // Only form the raw pointer on targets that actually use it, so the
        // fallback build stays warning-free.
        #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
        let ptr = data as *const T as *const u8;

        #[cfg(target_arch = "x86_64")]
        {
            unsafe {
                std::arch::asm!(
                    "prefetcht0 [{}]",
                    in(reg) ptr,
                    options(readonly, nostack)
                );
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            unsafe {
                std::arch::asm!(
                    "prfm pldl1keep, [{}]",
                    in(reg) ptr,
                    options(readonly, nostack)
                );
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            std::hint::black_box(data);
        }
    }

    #[inline(always)]
    pub fn prefetch_write<T>(data: &mut T) {
        #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
        let ptr = data as *mut T as *mut u8;

        #[cfg(target_arch = "x86_64")]
        {
            unsafe {
                // prefetcht0 stands in for a write prefetch here; the
                // dedicated prefetchw instruction has its own CPUID feature
                // flag and is not universally available.
                std::arch::asm!(
                    "prefetcht0 [{}]",
                    in(reg) ptr,
                    options(nostack)
                );
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            unsafe {
                std::arch::asm!(
                    "prfm pstl1keep, [{}]",
                    in(reg) ptr,
                    options(nostack)
                );
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            std::hint::black_box(data);
        }
    }

    #[inline(always)]
    pub fn prefetch_with_locality<T>(data: &T, locality: Locality) {
        #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
        let ptr = data as *const T as *const u8;

        #[cfg(target_arch = "x86_64")]
        {
            unsafe {
                match locality {
                    Locality::High => {
                        std::arch::asm!(
                            "prefetcht0 [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                    Locality::Medium => {
                        std::arch::asm!(
                            "prefetcht1 [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                    Locality::Low => {
                        std::arch::asm!(
                            "prefetcht2 [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                    Locality::None => {
                        std::arch::asm!(
                            "prefetchnta [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                }
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            unsafe {
                match locality {
                    Locality::High => {
                        std::arch::asm!(
                            "prfm pldl1keep, [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                    Locality::Medium => {
                        std::arch::asm!(
                            "prfm pldl2keep, [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                    Locality::Low => {
                        std::arch::asm!(
                            "prfm pldl3keep, [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                    Locality::None => {
                        std::arch::asm!(
                            "prfm pldl1strm, [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                }
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            let _ = locality;
            std::hint::black_box(data);
        }
    }

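    /// Full memory fence: `mfence` on x86_64, `dmb sy` on AArch64, and a
    /// `SeqCst` atomic fence elsewhere.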
    #[inline(always)]
    pub fn memory_fence() {
        #[cfg(target_arch = "x86_64")]
        {
            unsafe {
                std::arch::asm!("mfence", options(nostack));
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            unsafe {
                std::arch::asm!("dmb sy", options(nostack));
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            std::sync::atomic::fence(std::sync::atomic::Ordering::SeqCst);
        }
    }

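    /// Flushes the cache line containing `data`: `clflush` followed by an
    /// ordering `mfence` on x86_64, `dc civac` (clean and invalidate to the
    /// point of coherency) on AArch64, and a `black_box` no-op elsewhere.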
    #[inline(always)]
    pub fn flush_cache_line<T>(data: &T) {
        #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
        let ptr = data as *const T as *const u8;

        #[cfg(target_arch = "x86_64")]
        {
            unsafe {
                // Flush the line, then fence so the flush is ordered before
                // subsequent stores. (The original only fenced and never
                // flushed, which left the line resident.)
                std::arch::asm!("clflush [{}]", in(reg) ptr, options(nostack));
                std::arch::asm!("mfence", options(nostack));
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            unsafe {
                std::arch::asm!(
                    "dc civac, {}",
                    in(reg) ptr,
                    options(nostack)
                );
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            std::hint::black_box(data);
        }
    }

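    /// Copies `src` into `dst`. Buffers larger than 64 KiB take a bulk byte
    /// copy terminated by `sfence` on x86_64; everything else falls back to
    /// `copy_from_slice`.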
    #[inline]
    pub fn cache_aware_copy<T: Copy>(src: &[T], dst: &mut [T]) {
        assert_eq!(src.len(), dst.len());

        if std::mem::size_of_val(src) > 64 * 1024 {
            #[cfg(target_arch = "x86_64")]
            {
                unsafe {
                    let src_ptr = src.as_ptr() as *const u8;
                    let dst_ptr = dst.as_mut_ptr() as *mut u8;
                    let len = std::mem::size_of_val(src);

                    std::ptr::copy_nonoverlapping(src_ptr, dst_ptr, len);

                    std::arch::asm!("sfence", options(nostack));
                }
                return;
            }
        }

        dst.copy_from_slice(src);
    }

    #[inline]
    pub fn cache_aware_memset<T: Copy>(dst: &mut [T], value: T) {
        if std::mem::size_of_val(dst) > 32 * 1024 {
            #[cfg(all(feature = "simd", target_arch = "x86_64"))]
            {
                if std::mem::size_of::<T>() == 8 {
                    let chunks = dst.len() / 2;
                    for i in 0..chunks {
                        dst[i * 2] = value;
                        dst[i * 2 + 1] = value;
                    }
                    for item in dst.iter_mut().skip(chunks * 2) {
                        *item = value;
                    }
                    return;
                }
            }
        }

        dst.fill(value);
    }
}

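/// Rolling performance statistics collected by [`AdaptiveOptimizer`].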
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct PerformanceMetrics {
    pub operation_times: std::collections::HashMap<String, f64>,
    pub strategy_success_rates: std::collections::HashMap<OptimizationStrategy, f64>,
    pub memory_bandwidth_utilization: f64,
    pub cache_hit_rate: f64,
    pub parallel_efficiency: f64,
}

impl Default for PerformanceMetrics {
    fn default() -> Self {
        Self {
            operation_times: std::collections::HashMap::new(),
            strategy_success_rates: std::collections::HashMap::new(),
            memory_bandwidth_utilization: 0.0,
            cache_hit_rate: 0.0,
            parallel_efficiency: 0.0,
        }
    }
}

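/// Execution strategies the adaptive optimizer can choose between.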
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum OptimizationStrategy {
    Scalar,
    Simd,
    Parallel,
    Gpu,
    Hybrid,
    CacheOptimized,
    MemoryBound,
    ComputeBound,
    ModernArchOptimized,
    VectorOptimized,
    EnergyEfficient,
    HighThroughput,
}

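/// Epsilon-greedy strategy selector. Each strategy carries a weight that
/// `update_weights` folds observed scores into as an exponential moving
/// average: `w <- (1 - learning_rate) * w + learning_rate * score`.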
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct StrategySelector {
    #[allow(dead_code)]
    preferred_strategy: OptimizationStrategy,
    strategy_weights: std::collections::HashMap<OptimizationStrategy, f64>,
    learning_rate: f64,
    exploration_rate: f64,
}

impl Default for StrategySelector {
    fn default() -> Self {
        let mut strategy_weights = std::collections::HashMap::new();
        strategy_weights.insert(OptimizationStrategy::Scalar, 1.0);
        strategy_weights.insert(OptimizationStrategy::Simd, 1.0);
        strategy_weights.insert(OptimizationStrategy::Parallel, 1.0);
        strategy_weights.insert(OptimizationStrategy::Gpu, 1.0);
        strategy_weights.insert(OptimizationStrategy::Hybrid, 1.0);
        strategy_weights.insert(OptimizationStrategy::CacheOptimized, 1.0);
        strategy_weights.insert(OptimizationStrategy::MemoryBound, 1.0);
        strategy_weights.insert(OptimizationStrategy::ComputeBound, 1.0);
        // Bias toward modern-architecture and vector paths by default.
        strategy_weights.insert(OptimizationStrategy::ModernArchOptimized, 1.5);
        strategy_weights.insert(OptimizationStrategy::VectorOptimized, 1.3);
        strategy_weights.insert(OptimizationStrategy::EnergyEfficient, 1.0);
        strategy_weights.insert(OptimizationStrategy::HighThroughput, 1.2);

        Self {
            preferred_strategy: OptimizationStrategy::ModernArchOptimized,
            strategy_weights,
            learning_rate: 0.1,
            exploration_rate: 0.1,
        }
    }
}

impl StrategySelector {
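    /// Picks a strategy for an operation over `operation_size` elements.
    /// A hash of the size stands in for a random draw: with probability
    /// `exploration_rate` a strategy is sampled for exploration; otherwise
    /// size, memory-boundedness, and detected hardware decide.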
    pub fn select_strategy(
        &self,
        operation_size: usize,
        is_memory_bound: bool,
    ) -> OptimizationStrategy {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        // Deterministic stand-in for a random draw: hash the size into [0, 1).
        let mut hasher = DefaultHasher::new();
        operation_size.hash(&mut hasher);
        let rand_val = (hasher.finish() % 100) as f64 / 100.0;

        if rand_val < self.exploration_rate {
            let strategies = [
                OptimizationStrategy::Scalar,
                OptimizationStrategy::Simd,
                OptimizationStrategy::Parallel,
                OptimizationStrategy::Gpu,
                OptimizationStrategy::ModernArchOptimized,
                OptimizationStrategy::VectorOptimized,
                OptimizationStrategy::EnergyEfficient,
                OptimizationStrategy::HighThroughput,
            ];
            strategies[operation_size % strategies.len()]
        } else if is_memory_bound {
            if is_apple_silicon() || is_neoverse_or_newer() {
                OptimizationStrategy::ModernArchOptimized
            } else {
                OptimizationStrategy::MemoryBound
            }
        } else if operation_size > 1_000_000 {
            OptimizationStrategy::HighThroughput
        } else if operation_size > 100_000 {
            if is_zen4_or_newer() || is_intel_golden_cove_or_newer() {
                OptimizationStrategy::VectorOptimized
            } else {
                OptimizationStrategy::Parallel
            }
        } else if operation_size > 1_000 {
            if is_zen4_or_newer() || is_apple_silicon() {
                OptimizationStrategy::ModernArchOptimized
            } else {
                OptimizationStrategy::Simd
            }
        } else if cfg!(target_os = "android") || cfg!(target_os = "ios") {
            OptimizationStrategy::EnergyEfficient
        } else {
            OptimizationStrategy::Scalar
        }
    }

    pub fn update_weights(&mut self, strategy: OptimizationStrategy, performance_score: f64) {
        if let Some(weight) = self.strategy_weights.get_mut(&strategy) {
            *weight = *weight * (1.0 - self.learning_rate) + performance_score * self.learning_rate;
        }
    }

    #[allow(dead_code)]
    fn is_neoverse_or_newer() -> bool {
        crate::performance_optimization::is_neoverse_or_newer()
    }

    #[allow(dead_code)]
    fn is_zen4_or_newer() -> bool {
        crate::performance_optimization::is_zen4_or_newer()
    }

    #[allow(dead_code)]
    fn is_intel_golden_cove_or_newer() -> bool {
        crate::performance_optimization::is_intel_golden_cove_or_newer()
    }
}

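// The detectors below approximate microarchitecture generations from feature
// flags. They can misclassify (for example, any AVX-512VL part passes the
// Zen 4 check), but they err toward enabling the newer code paths.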
#[allow(dead_code)]
fn is_zen4_or_newer() -> bool {
    #[cfg(target_arch = "x86_64")]
    {
        is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl")
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        false
    }
}

#[allow(dead_code)]
fn is_intel_golden_cove_or_newer() -> bool {
    #[cfg(target_arch = "x86_64")]
    {
        is_x86_feature_detected!("avx2")
            && is_x86_feature_detected!("fma")
            && is_x86_feature_detected!("bmi2")
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        false
    }
}

#[allow(dead_code)]
fn is_apple_silicon() -> bool {
    #[cfg(target_arch = "aarch64")]
    {
        cfg!(target_vendor = "apple")
    }
    #[cfg(not(target_arch = "aarch64"))]
    {
        false
    }
}

#[allow(dead_code)]
fn is_neoverse_or_newer() -> bool {
    #[cfg(target_arch = "aarch64")]
    {
        std::arch::is_aarch64_feature_detected!("asimd")
            && std::arch::is_aarch64_feature_detected!("crc")
            && std::arch::is_aarch64_feature_detected!("fp")
    }
    #[cfg(not(target_arch = "aarch64"))]
    {
        false
    }
}

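/// Runtime-adaptive dispatcher. Dispatch thresholds are atomics so that
/// measurement feedback can adjust them through a shared reference; the
/// metrics and strategy selector sit behind `RwLock`s.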
pub struct AdaptiveOptimizer {
    parallel_threshold: AtomicUsize,
    simd_threshold: AtomicUsize,
    #[allow(dead_code)]
    gpu_threshold: AtomicUsize,
    cache_line_size: usize,
    performance_metrics: std::sync::RwLock<PerformanceMetrics>,
    strategy_selector: std::sync::RwLock<StrategySelector>,
}

impl AdaptiveOptimizer {
    pub fn new() -> Self {
        Self {
            parallel_threshold: AtomicUsize::new(10_000),
            simd_threshold: AtomicUsize::new(1_000),
            gpu_threshold: AtomicUsize::new(100_000),
            cache_line_size: Self::detect_cache_line_size(),
            performance_metrics: std::sync::RwLock::new(PerformanceMetrics::default()),
            strategy_selector: std::sync::RwLock::new(StrategySelector::default()),
        }
    }

    fn detect_cache_line_size() -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            64
        }
        #[cfg(target_arch = "aarch64")]
        {
            if is_apple_silicon() {
                // Apple M-series cores use 128-byte cache lines.
                128
            } else {
                // Neoverse and most other AArch64 designs use 64-byte lines.
                64
            }
        }
        #[cfg(target_arch = "riscv64")]
        {
            64
        }
        #[cfg(not(any(
            target_arch = "x86_64",
            target_arch = "aarch64",
            target_arch = "riscv64"
        )))]
        {
            64
        }
    }

    #[inline]
    #[allow(unused_variables)]
    pub fn should_use_parallel(&self, size: usize) -> bool {
        #[cfg(feature = "parallel")]
        {
            size >= self.parallel_threshold.load(Ordering::Relaxed)
        }
        #[cfg(not(feature = "parallel"))]
        {
            false
        }
    }

    #[inline]
    #[allow(unused_variables)]
    pub fn should_use_simd(&self, size: usize) -> bool {
        #[cfg(feature = "simd")]
        {
            size >= self.simd_threshold.load(Ordering::Relaxed)
        }
        #[cfg(not(feature = "simd"))]
        {
            false
        }
    }

    pub fn update_from_measurement(&mut self, operation: &str, size: usize, duration_ns: u64) {
        let ops_per_ns = size as f64 / duration_ns as f64;

        // If a strategy underperformed, raise its threshold so it only kicks
        // in for larger inputs next time.
        if operation.contains("parallel") && ops_per_ns < 0.1 {
            self.parallel_threshold
                .fetch_add(size / 10, Ordering::Relaxed);
        } else if operation.contains("simd") && ops_per_ns < 1.0 {
            self.simd_threshold.fetch_add(size / 10, Ordering::Relaxed);
        }
    }

    #[inline]
    pub fn optimal_chunk_size<T>(&self) -> usize {
        let element_size = std::mem::size_of::<T>();
        let elements_per_cache_line = self.cache_line_size / element_size.max(1);

        // Process sixteen cache lines' worth of elements per chunk.
        elements_per_cache_line * 16
    }

    #[inline]
    #[allow(unused_variables)]
    pub fn should_use_gpu(&self, size: usize) -> bool {
        #[cfg(feature = "gpu")]
        {
            size >= self.gpu_threshold.load(Ordering::Relaxed)
        }
        #[cfg(not(feature = "gpu"))]
        {
            false
        }
    }

    pub fn select_for_operation(&self, operation_name: &str, size: usize) -> OptimizationStrategy {
        let memory_bound = operation_name.contains("copy")
            || operation_name.contains("memset")
            || operation_name.contains("transpose");

        if let Ok(selector) = self.strategy_selector.read() {
            selector.select_strategy(size, memory_bound)
        } else if self.should_use_gpu(size) {
            OptimizationStrategy::Gpu
        } else if self.should_use_parallel(size) {
            OptimizationStrategy::Parallel
        } else if self.should_use_simd(size) {
            OptimizationStrategy::Simd
        } else {
            OptimizationStrategy::Scalar
        }
    }

    pub fn record_performance(
        &mut self,
        operation: &str,
        size: usize,
        strategy: OptimizationStrategy,
        duration_ns: u64,
    ) {
        let ops_per_ns = size as f64 / duration_ns as f64;
        // Clamp to [0, 1]: ten ops per nanosecond or better counts as perfect.
        let performance_score = ops_per_ns.min(10.0) / 10.0;

        if let Ok(mut selector) = self.strategy_selector.write() {
            selector.update_weights(strategy, performance_score);
        }

        if let Ok(mut metrics) = self.performance_metrics.write() {
            let avg_time = metrics
                .operation_times
                .entry(operation.to_string())
                .or_insert(0.0);
            // Exponential moving average over observed durations.
            *avg_time = (*avg_time * 0.9) + (duration_ns as f64 * 0.1);

            metrics
                .strategy_success_rates
                .insert(strategy, performance_score);
        }
    }

    pub fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
        self.performance_metrics.read().ok().map(|m| m.clone())
    }

    pub fn analyze_operation(&self, operation_name: &str, input_size: usize) -> OptimizationAdvice {
        let strategy = self.select_optimal_strategy(operation_name, input_size);
        let chunk_size = if strategy == OptimizationStrategy::Parallel {
            Some(self.optimal_chunk_size::<f64>())
        } else {
            None
        };

        let prefetch_distance = if input_size > 10_000 {
            // Prefetch eight cache lines ahead.
            Some(self.cache_line_size * 8)
        } else {
            None
        };

        OptimizationAdvice {
            recommended_strategy: strategy,
            optimal_chunk_size: chunk_size,
            prefetch_distance,
            memory_allocation_hint: if input_size > 1_000_000 {
                Some("Consider using memory-mapped files for large outputs".to_string())
            } else {
                None
            },
        }
    }

    #[allow(dead_code)]
    fn is_zen4_or_newer() -> bool {
        crate::performance_optimization::is_zen4_or_newer()
    }

    #[allow(dead_code)]
    fn is_intel_golden_cove_or_newer() -> bool {
        crate::performance_optimization::is_intel_golden_cove_or_newer()
    }

    pub fn select_optimal_strategy(
        &self,
        _operation_name: &str,
        input_size: usize,
    ) -> OptimizationStrategy {
        if input_size >= self.gpu_threshold.load(Ordering::Relaxed) && self.has_gpu_support() {
            return OptimizationStrategy::Gpu;
        }

        if input_size >= self.parallel_threshold.load(Ordering::Relaxed) {
            return OptimizationStrategy::Parallel;
        }

        if input_size >= self.simd_threshold.load(Ordering::Relaxed) && self.has_simd_support() {
            return OptimizationStrategy::Simd;
        }

        OptimizationStrategy::Scalar
    }

    pub fn has_gpu_support(&self) -> bool {
        // No GPU backend is wired up yet.
        false
    }

    pub fn has_simd_support(&self) -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            std::arch::is_x86_feature_detected!("avx2")
                || std::arch::is_x86_feature_detected!("sse4.1")
        }
        #[cfg(target_arch = "aarch64")]
        {
            std::arch::is_aarch64_feature_detected!("neon")
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            false
        }
    }
}

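/// Tuning advice produced by [`AdaptiveOptimizer::analyze_operation`].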
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct OptimizationAdvice {
    pub recommended_strategy: OptimizationStrategy,
    pub optimal_chunk_size: Option<usize>,
    pub prefetch_distance: Option<usize>,
    pub memory_allocation_hint: Option<String>,
}

impl Default for AdaptiveOptimizer {
    fn default() -> Self {
        Self::new()
    }
}

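/// Hand-tuned kernels for a few hot operations, dispatched through
/// [`AdaptiveOptimizer`].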
pub mod fast_paths {
    use super::*;

    #[inline]
    #[allow(unused_variables)]
    pub fn add_f64_arrays(a: &[f64], b: &[f64], result: &mut [f64]) -> Result<(), &'static str> {
        if a.len() != b.len() || a.len() != result.len() {
            return Err("Array lengths must match");
        }

        let len = a.len();
        let optimizer = AdaptiveOptimizer::new();

        #[cfg(feature = "simd")]
        if optimizer.should_use_simd(len) {
            use crate::simd_ops::SimdUnifiedOps;
            use ndarray::ArrayView1;

            let simd_chunks = len / 4;
            for i in 0..simd_chunks {
                let start = i * 4;
                let end = start + 4;

                if end <= len {
                    let a_view = ArrayView1::from(&a[start..end]);
                    let b_view = ArrayView1::from(&b[start..end]);

                    let simd_result = f64::simd_add(&a_view, &b_view);
                    result[start..end].copy_from_slice(simd_result.as_slice().unwrap());
                }
            }

            // Scalar tail for elements that don't fill a SIMD lane.
            for i in (simd_chunks * 4)..len {
                result[i] = a[i] + b[i];
            }
            return Ok(());
        }

        #[cfg(feature = "parallel")]
        if optimizer.should_use_parallel(len) {
            use crate::parallel_ops::*;
            result
                .par_chunks_mut(optimizer.optimal_chunk_size::<f64>())
                .zip(a.par_chunks(optimizer.optimal_chunk_size::<f64>()))
                .zip(b.par_chunks(optimizer.optimal_chunk_size::<f64>()))
                .for_each(|((r_chunk, a_chunk), b_chunk)| {
                    for i in 0..r_chunk.len() {
                        r_chunk[i] = a_chunk[i] + b_chunk[i];
                    }
                });
            return Ok(());
        }

        // Scalar fallback, unrolled by eight.
        let chunks = len / 8;

        for i in 0..chunks {
            let idx = i * 8;
            result[idx] = a[idx] + b[idx];
            result[idx + 1] = a[idx + 1] + b[idx + 1];
            result[idx + 2] = a[idx + 2] + b[idx + 2];
            result[idx + 3] = a[idx + 3] + b[idx + 3];
            result[idx + 4] = a[idx + 4] + b[idx + 4];
            result[idx + 5] = a[idx + 5] + b[idx + 5];
            result[idx + 6] = a[idx + 6] + b[idx + 6];
            result[idx + 7] = a[idx + 7] + b[idx + 7];
        }

        for i in (chunks * 8)..len {
            result[i] = a[i] + b[i];
        }

        Ok(())
    }

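    /// Tiled matrix multiply: computes `c = a * b` for row-major `a`
    /// (`m x k`), `b` (`k x n`), and `c` (`m x n`). 64x64x64 tiles keep the
    /// working set cache-resident; with the `parallel` feature, row bands of
    /// tiles are computed on separate threads and merged under a mutex.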
    #[inline]
    pub fn matmul_kernel(
        a: &[f64],
        b: &[f64],
        c: &mut [f64],
        m: usize,
        k: usize,
        n: usize,
    ) -> Result<(), &'static str> {
        if a.len() != m * k || b.len() != k * n || c.len() != m * n {
            return Err("Invalid matrix dimensions");
        }

        const TILE_M: usize = 64;
        const TILE_N: usize = 64;
        const TILE_K: usize = 64;

        c.fill(0.0);

        #[cfg(feature = "parallel")]
        {
            let optimizer = AdaptiveOptimizer::new();
            if optimizer.should_use_parallel(m * n) {
                use crate::parallel_ops::*;

                use std::sync::Mutex;
                let c_mutex = Mutex::new(c);

                (0..m).into_par_iter().step_by(TILE_M).for_each(|i0| {
                    let i_max = (i0 + TILE_M).min(m);
                    let mut local_updates = Vec::new();

                    for j0 in (0..n).step_by(TILE_N) {
                        for k0 in (0..k).step_by(TILE_K) {
                            let j_max = (j0 + TILE_N).min(n);
                            let k_max = (k0 + TILE_K).min(k);

                            for i in i0..i_max {
                                for j in j0..j_max {
                                    let mut sum = 0.0;
                                    for k_idx in k0..k_max {
                                        sum += a[i * k + k_idx] * b[k_idx * n + j];
                                    }
                                    local_updates.push((i, j, sum));
                                }
                            }
                        }
                    }

                    // Apply this row band's partial sums under the lock.
                    if let Ok(mut c_guard) = c_mutex.lock() {
                        for (i, j, sum) in local_updates {
                            c_guard[i * n + j] += sum;
                        }
                    }
                });
                return Ok(());
            }
        }

        for i0 in (0..m).step_by(TILE_M) {
            for j0 in (0..n).step_by(TILE_N) {
                for k0 in (0..k).step_by(TILE_K) {
                    let i_max = (i0 + TILE_M).min(m);
                    let j_max = (j0 + TILE_N).min(n);
                    let k_max = (k0 + TILE_K).min(k);

                    for i in i0..i_max {
                        for j in j0..j_max {
                            let mut sum = c[i * n + j];
                            for k_idx in k0..k_max {
                                sum += a[i * k + k_idx] * b[k_idx * n + j];
                            }
                            c[i * n + j] = sum;
                        }
                    }
                }
            }
        }

        Ok(())
    }
}

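/// Classifies an observed sequence of addresses into an access pattern so
/// callers can pick a matching prefetch policy.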
#[allow(dead_code)]
pub struct MemoryAccessOptimizer {
    stride_detector: StrideDetector,
}

#[derive(Default)]
#[allow(dead_code)]
struct StrideDetector {
    last_address: Option<usize>,
    detected_stride: Option<isize>,
    confidence: f32,
}

impl MemoryAccessOptimizer {
    pub fn new() -> Self {
        Self {
            stride_detector: StrideDetector::default(),
        }
    }

    pub fn analyze_access_pattern<T>(&mut self, addresses: &[*const T]) -> AccessPattern {
        // Fewer than two addresses give no stride to measure (and a single
        // address would otherwise panic on the `strides[0]` access below).
        if addresses.len() < 2 {
            return AccessPattern::Unknown;
        }

        let mut strides = Vec::new();
        for window in addresses.windows(2) {
            let stride = (window[1] as isize) - (window[0] as isize);
            strides.push(stride / std::mem::size_of::<T>() as isize);
        }

        if strides.windows(2).all(|w| w[0] == w[1]) {
            match strides[0] {
                1 => AccessPattern::Sequential,
                -1 => AccessPattern::ReverseSequential,
                s if s > 1 => AccessPattern::Strided(s as usize),
                _ => AccessPattern::Random,
            }
        } else {
            AccessPattern::Random
        }
    }
}

#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AccessPattern {
    Sequential,
    ReverseSequential,
    Strided(usize),
    Random,
    Unknown,
}

impl Default for MemoryAccessOptimizer {
    fn default() -> Self {
        Self::new()
    }
}

pub use crate::performance::benchmarking;

pub use crate::performance::cache_optimization as cache_aware_algorithms;

pub use crate::performance::advanced_optimization;

#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    #[cfg(feature = "benchmarking")]
    use crate::benchmarking;

    #[test]
    fn test_adaptive_optimizer() {
        let optimizer = AdaptiveOptimizer::new();

        assert!(!optimizer.should_use_parallel(100));

        #[cfg(feature = "parallel")]
        assert!(optimizer.should_use_parallel(100_000));

        let chunk_size = optimizer.optimal_chunk_size::<f64>();
        assert!(chunk_size > 0);
        assert_eq!(chunk_size % 16, 0);
    }

    #[test]
    fn test_fast_path_addition() {
        let a = vec![1.0; 32];
        let b = vec![2.0; 32];
        let mut result = vec![0.0; 32];

        fast_paths::add_f64_arrays(&a, &b, &mut result).unwrap();

        for val in result {
            assert_eq!(val, 3.0);
        }
    }

    #[test]
    fn test_memory_access_pattern() {
        let mut optimizer = MemoryAccessOptimizer::new();

        let addresses: Vec<*const f64> = (0..10)
            .map(|i| (i * std::mem::size_of::<f64>()) as *const f64)
            .collect();
        assert_eq!(
            optimizer.analyze_access_pattern(&addresses),
            AccessPattern::Sequential
        );

        let addresses: Vec<*const f64> = (0..10)
            .map(|i| (i * 3 * std::mem::size_of::<f64>()) as *const f64)
            .collect();
        assert_eq!(
            optimizer.analyze_access_pattern(&addresses),
            AccessPattern::Strided(3)
        );
    }

    #[test]
    fn test_performance_hints() {
        assert!(PerformanceHints::likely(true));
        assert!(!PerformanceHints::likely(false));
        assert!(PerformanceHints::unlikely(true));
        assert!(!PerformanceHints::unlikely(false));

        let data = [1.0f64; 100];
        PerformanceHints::prefetch_read(&data[0]);

        let mut data_mut = [0.0f64; 100];
        PerformanceHints::prefetch_write(&mut data_mut[0]);

        PerformanceHints::prefetch_with_locality(&data[0], Locality::High);
        PerformanceHints::prefetch_with_locality(&data[0], Locality::Medium);
        PerformanceHints::prefetch_with_locality(&data[0], Locality::Low);
        PerformanceHints::prefetch_with_locality(&data[0], Locality::None);
    }

    #[test]
    fn test_cache_operations() {
        let data = [1.0f64; 8];

        PerformanceHints::flush_cache_line(&data[0]);

        PerformanceHints::memory_fence();

        let src = vec![1.0f64; 64];
        let mut dst = vec![0.0f64; 64];
        PerformanceHints::cache_aware_copy(&src, &mut dst);
        assert_eq!(src, dst);

        let mut data = vec![0.0f64; 64];
        PerformanceHints::cache_aware_memset(&mut data, 5.0);
        assert!(data.iter().all(|&x| x == 5.0));
    }

    #[test]
    fn test_locality_enum() {
        let localities = [
            Locality::High,
            Locality::Medium,
            Locality::Low,
            Locality::None,
        ];

        for locality in &localities {
            let data = 42i32;
            PerformanceHints::prefetch_with_locality(&data, *locality);
        }

        assert_eq!(Locality::High, Locality::High);
        assert_ne!(Locality::High, Locality::Low);

        assert!(format!("{:?}", Locality::High).contains("High"));
    }

    #[test]
    fn test_strategy_selector() {
        let mut selector = StrategySelector::default();

        let strategy = selector.select_strategy(1000, false);
        assert!(matches!(
            strategy,
            OptimizationStrategy::Simd
                | OptimizationStrategy::Scalar
                | OptimizationStrategy::Parallel
                | OptimizationStrategy::Gpu
        ));

        selector.update_weights(OptimizationStrategy::Simd, 0.8);
        selector.update_weights(OptimizationStrategy::Parallel, 0.9);

        assert!(selector.strategy_weights[&OptimizationStrategy::Simd] != 1.0);
        assert!(selector.strategy_weights[&OptimizationStrategy::Parallel] != 1.0);
    }

    #[test]
    fn test_adaptive_optimizer_enhanced() {
        let mut optimizer = AdaptiveOptimizer::new();

        assert!(!optimizer.should_use_gpu(1000));

        let strategy = optimizer.select_optimal_strategy("matrix_multiply", 50_000);
        assert!(matches!(
            strategy,
            OptimizationStrategy::Parallel
                | OptimizationStrategy::Simd
                | OptimizationStrategy::Scalar
                | OptimizationStrategy::Gpu
                | OptimizationStrategy::Hybrid
                | OptimizationStrategy::CacheOptimized
                | OptimizationStrategy::MemoryBound
                | OptimizationStrategy::ComputeBound
                | OptimizationStrategy::ModernArchOptimized
                | OptimizationStrategy::VectorOptimized
                | OptimizationStrategy::EnergyEfficient
                | OptimizationStrategy::HighThroughput
        ));

        optimizer.record_performance("test_op", 1000, OptimizationStrategy::Simd, 1_000_000);

        let advice = optimizer.analyze_operation("matrix_multiply", 10_000);
        assert!(matches!(
            advice.recommended_strategy,
            OptimizationStrategy::Parallel
                | OptimizationStrategy::Simd
                | OptimizationStrategy::Scalar
                | OptimizationStrategy::Gpu
                | OptimizationStrategy::Hybrid
                | OptimizationStrategy::CacheOptimized
                | OptimizationStrategy::MemoryBound
                | OptimizationStrategy::ComputeBound
                | OptimizationStrategy::ModernArchOptimized
                | OptimizationStrategy::VectorOptimized
                | OptimizationStrategy::EnergyEfficient
                | OptimizationStrategy::HighThroughput
        ));

        let metrics = optimizer.get_performance_metrics();
        assert!(metrics.is_some());
    }

    #[test]
    fn test_optimization_strategy_enum() {
        let strategies = [
            OptimizationStrategy::Scalar,
            OptimizationStrategy::Simd,
            OptimizationStrategy::Parallel,
            OptimizationStrategy::Gpu,
            OptimizationStrategy::Hybrid,
            OptimizationStrategy::CacheOptimized,
            OptimizationStrategy::MemoryBound,
            OptimizationStrategy::ComputeBound,
        ];

        for strategy in &strategies {
            assert!(!format!("{strategy:?}").is_empty());

            assert_eq!(*strategy, *strategy);
        }
    }

    #[test]
    fn test_performance_metrics() {
        let mut metrics = PerformanceMetrics::default();

        metrics
            .operation_times
            .insert("test_op".to_string(), 1000.0);
        assert_eq!(metrics.operation_times["test_op"], 1000.0);

        metrics
            .strategy_success_rates
            .insert(OptimizationStrategy::Simd, 0.85);
        assert_eq!(
            metrics.strategy_success_rates[&OptimizationStrategy::Simd],
            0.85
        );

        metrics.memory_bandwidth_utilization = 0.75;
        metrics.cache_hit_rate = 0.90;
        metrics.parallel_efficiency = 0.80;

        assert_eq!(metrics.memory_bandwidth_utilization, 0.75);
        assert_eq!(metrics.cache_hit_rate, 0.90);
        assert_eq!(metrics.parallel_efficiency, 0.80);
    }

    #[test]
    fn test_optimization_advice() {
        let advice = OptimizationAdvice {
            recommended_strategy: OptimizationStrategy::Parallel,
            optimal_chunk_size: Some(1024),
            prefetch_distance: Some(64),
            memory_allocation_hint: Some("Use memory mapping".to_string()),
        };

        assert_eq!(advice.recommended_strategy, OptimizationStrategy::Parallel);
        assert_eq!(advice.optimal_chunk_size, Some(1024));
        assert_eq!(advice.prefetch_distance, Some(64));
        assert!(advice.memory_allocation_hint.is_some());

        assert!(!format!("{advice:?}").is_empty());
    }

    #[test]
    #[ignore = "timeout"]
    fn test_benchmarking_config() {
        let config = benchmarking::BenchmarkConfig::default();

        assert_eq!(config.warmup_iterations, 5);
        assert_eq!(config.measurement_iterations, 20);
        assert!(!config.sample_sizes.is_empty());
        assert!(!config.strategies.is_empty());

        let array_config = benchmarking::presets::array_operations();
        assert_eq!(array_config.warmup_iterations, 3);
        assert_eq!(array_config.measurement_iterations, 10);

        let matrix_config = benchmarking::presets::matrix_operations();
        assert_eq!(matrix_config.warmup_iterations, 5);
        assert_eq!(matrix_config.measurement_iterations, 15);

        let memory_config = benchmarking::presets::memory_intensive();
        assert_eq!(memory_config.warmup_iterations, 2);
        assert_eq!(memory_config.measurement_iterations, 8);
    }

    #[test]
    #[ignore = "timeout"]
    fn test_benchmark_measurement() {
        let measurement = benchmarking::BenchmarkMeasurement {
            duration: Duration::from_millis(5),
            strategy: OptimizationStrategy::Simd,
            input_size: 1000,
            throughput: 200_000.0,
            memory_usage: 8000,
            custom_metrics: std::collections::HashMap::new(),
        };

        assert_eq!(measurement.strategy, OptimizationStrategy::Simd);
        assert_eq!(measurement.input_size, 1000);
        assert_eq!(measurement.throughput, 200_000.0);
        assert_eq!(measurement.memory_usage, 8000);
    }

    #[test]
    #[ignore = "timeout"]
    fn test_benchmark_runner() {
        let config = benchmarking::BenchmarkConfig {
            warmup_iterations: 1,
            measurement_iterations: 2,
            min_duration: Duration::from_millis(1),
            max_duration: Duration::from_secs(1),
            sample_sizes: vec![10, 100],
            strategies: vec![OptimizationStrategy::Scalar, OptimizationStrategy::Simd],
        };

        let runner = benchmarking::BenchmarkRunner::new(config);

        let results = runner.benchmark_operation("test_add", |data, _strategy| {
            let result: Vec<f64> = data.iter().map(|x| *x + 1.0).collect();
            (Duration::from_millis(1), result)
        });

        assert!(!results.measurements.is_empty());
    }

    #[test]
    fn test_strategy_performance() {
        let performance = benchmarking::StrategyPerformance {
            avg_throughput: 150_000.0,
            throughput_stddev: 5_000.0,
            avg_memory_usage: 8000.0,
            optimal_size: 10_000,
            efficiency_score: 0.85,
        };

        assert_eq!(performance.avg_throughput, 150_000.0);
        assert_eq!(performance.throughput_stddev, 5_000.0);
        assert_eq!(performance.optimal_size, 10_000);
        assert_eq!(performance.efficiency_score, 0.85);
    }

    #[test]
    fn test_scalability_analysis() {
        let mut parallel_efficiency = std::collections::HashMap::new();
        parallel_efficiency.insert(1000, 0.8);
        parallel_efficiency.insert(10000, 0.9);

        let memory_scaling = benchmarking::MemoryScaling {
            linear_coefficient: 8.0,
            constant_coefficient: 1024.0,
            r_squared: 0.95,
        };

        let bottleneck = benchmarking::PerformanceBottleneck {
            bottleneck_type: benchmarking::BottleneckType::MemoryBandwidth,
            size_range: (10000, 10000),
            impact: 0.3,
            mitigation: "Use memory prefetching".to_string(),
        };

        let analysis = benchmarking::ScalabilityAnalysis {
            parallel_efficiency,
            memory_scaling,
            bottlenecks: vec![bottleneck],
        };

        assert_eq!(analysis.parallel_efficiency[&1000], 0.8);
        assert_eq!(analysis.memory_scaling.linear_coefficient, 8.0);
        assert_eq!(analysis.bottlenecks.len(), 1);
        assert_eq!(
            analysis.bottlenecks[0].bottleneck_type,
            benchmarking::BottleneckType::MemoryBandwidth
        );
    }

    #[test]
    fn test_memory_scaling() {
        let scaling = benchmarking::MemoryScaling {
            linear_coefficient: 8.0,
            constant_coefficient: 512.0,
            r_squared: 0.99,
        };

        assert_eq!(scaling.linear_coefficient, 8.0);
        assert_eq!(scaling.constant_coefficient, 512.0);
        assert_eq!(scaling.r_squared, 0.99);
    }

    #[test]
    fn test_performance_bottleneck() {
        let bottleneck = benchmarking::PerformanceBottleneck {
            bottleneck_type: benchmarking::BottleneckType::SynchronizationOverhead,
            size_range: (1000, 5000),
            impact: 0.6,
            mitigation: "Reduce thread contention".to_string(),
        };

        assert_eq!(
            bottleneck.bottleneck_type,
            benchmarking::BottleneckType::SynchronizationOverhead
        );
        assert_eq!(bottleneck.size_range, (1000, 5000));
        assert_eq!(bottleneck.impact, 0.6);
        assert_eq!(bottleneck.mitigation, "Reduce thread contention");
    }

    #[test]
    fn test_bottleneck_type_enum() {
        let bottleneck_types = [
            benchmarking::BottleneckType::MemoryBandwidth,
            benchmarking::BottleneckType::CacheLatency,
            benchmarking::BottleneckType::ComputeBound,
            benchmarking::BottleneckType::SynchronizationOverhead,
            benchmarking::BottleneckType::AlgorithmicComplexity,
        ];

        for bottleneck_type in &bottleneck_types {
            assert!(!format!("{bottleneck_type:?}").is_empty());

            assert_eq!(*bottleneck_type, *bottleneck_type);
        }

        assert_ne!(
            benchmarking::BottleneckType::MemoryBandwidth,
            benchmarking::BottleneckType::CacheLatency
        );
    }

    #[test]
    #[ignore = "timeout"]
    fn test_benchmark_results() {
        let measurement = benchmarking::BenchmarkMeasurement {
            strategy: OptimizationStrategy::Parallel,
            input_size: 1000,
            duration: Duration::from_millis(10),
            throughput: 100_000.0,
            memory_usage: 8000,
            custom_metrics: std::collections::HashMap::new(),
        };

        let mut strategy_summary = std::collections::HashMap::new();
        strategy_summary.insert(
            OptimizationStrategy::Parallel,
            benchmarking::StrategyPerformance {
                avg_throughput: 100_000.0,
                throughput_stddev: 1_000.0,
                avg_memory_usage: 8000.0,
                optimal_size: 1000,
                efficiency_score: 0.9,
            },
        );

        let scalability_analysis = benchmarking::ScalabilityAnalysis {
            parallel_efficiency: std::collections::HashMap::new(),
            memory_scaling: benchmarking::MemoryScaling {
                linear_coefficient: 8.0,
                constant_coefficient: 0.0,
                r_squared: 1.0,
            },
            bottlenecks: Vec::new(),
        };

        let results = benchmarking::BenchmarkResults {
            operation_name: "test_operation".to_string(),
            measurements: vec![measurement],
            strategy_summary,
            scalability_analysis,
            recommendations: vec!["Use parallel strategy".to_string()],
            total_duration: Duration::from_millis(100),
        };

        assert_eq!(results.operation_name, "test_operation");
        assert_eq!(results.measurements.len(), 1);
        assert_eq!(results.strategy_summary.len(), 1);
        assert_eq!(results.recommendations.len(), 1);
        assert_eq!(results.total_duration, Duration::from_millis(100));
    }

    #[test]
    fn test_modern_architecture_detection() {
        // Smoke test: the detectors must run without panicking on any
        // target; their results are target-dependent.
        let _zen4 = is_zen4_or_newer();
        let _golden_cove = is_intel_golden_cove_or_newer();
        let _apple_silicon = is_apple_silicon();
        let _neoverse = is_neoverse_or_newer();
    }

    #[test]
    fn test_enhanced_strategy_selector() {
        let selector = StrategySelector::default();

        assert!(selector
            .strategy_weights
            .contains_key(&OptimizationStrategy::ModernArchOptimized));
        assert!(selector
            .strategy_weights
            .contains_key(&OptimizationStrategy::VectorOptimized));
        assert!(selector
            .strategy_weights
            .contains_key(&OptimizationStrategy::EnergyEfficient));
        assert!(selector
            .strategy_weights
            .contains_key(&OptimizationStrategy::HighThroughput));

        let modern_weight = selector
            .strategy_weights
            .get(&OptimizationStrategy::ModernArchOptimized)
            .unwrap();
        let scalar_weight = selector
            .strategy_weights
            .get(&OptimizationStrategy::Scalar)
            .unwrap();
        assert!(modern_weight > scalar_weight);
    }

    #[test]
    fn test_enhanced_strategy_selection() {
        let selector = StrategySelector::default();

        let small_strategy = selector.select_strategy(100, false);
        assert!(matches!(
            small_strategy,
            OptimizationStrategy::Scalar
                | OptimizationStrategy::EnergyEfficient
                | OptimizationStrategy::ModernArchOptimized
        ));

        let large_strategy = selector.select_strategy(1_000_000, false);
        assert!(matches!(
            large_strategy,
            OptimizationStrategy::HighThroughput
                | OptimizationStrategy::VectorOptimized
                | OptimizationStrategy::Parallel
        ));

        let memory_bound_strategy = selector.select_strategy(10_000, true);
        assert!(matches!(
            memory_bound_strategy,
            OptimizationStrategy::MemoryBound | OptimizationStrategy::ModernArchOptimized
        ));
    }

    #[test]
    #[cfg(feature = "benchmarking")]
    #[ignore = "timeout"]
    fn test_advanced_benchmark_config() {
        let config = benchmarking::presets::advanced_comprehensive();

        assert!(config
            .strategies
            .contains(&OptimizationStrategy::ModernArchOptimized));
        assert!(config
            .strategies
            .contains(&OptimizationStrategy::VectorOptimized));
        assert!(config
            .strategies
            .contains(&OptimizationStrategy::EnergyEfficient));
        assert!(config
            .strategies
            .contains(&OptimizationStrategy::HighThroughput));

        assert!(config.sample_sizes.len() >= 10);
        assert!(config.sample_sizes.contains(&100));
        assert!(config.sample_sizes.contains(&5_000_000));

        assert!(config.measurement_iterations >= 25);
        assert!(config.warmup_iterations >= 10);
    }

    #[test]
    #[cfg(feature = "benchmarking")]
    #[ignore = "timeout"]
    fn test_modern_architecture_benchmark_config() {
        let config = benchmarking::presets::modern_architectures();

        assert_eq!(config.strategies.len(), 4);
        assert!(config
            .strategies
            .contains(&OptimizationStrategy::ModernArchOptimized));
        assert!(config
            .strategies
            .contains(&OptimizationStrategy::VectorOptimized));
        assert!(config
            .strategies
            .contains(&OptimizationStrategy::HighThroughput));
        assert!(config
            .strategies
            .contains(&OptimizationStrategy::EnergyEfficient));

        assert!(!config.strategies.contains(&OptimizationStrategy::Scalar));
    }

    #[test]
    fn test_enhanced_cache_line_detection() {
        let optimizer = AdaptiveOptimizer::new();
        let cache_line_size = optimizer.cache_line_size;

        assert!(cache_line_size == 64 || cache_line_size == 128);

        assert_eq!(cache_line_size & (cache_line_size - 1), 0);
    }

    #[test]
    fn test_strategy_weight_updates() {
        let mut selector = StrategySelector::default();
        let initial_weight = *selector
            .strategy_weights
            .get(&OptimizationStrategy::ModernArchOptimized)
            .unwrap();

        selector.update_weights(OptimizationStrategy::ModernArchOptimized, 0.9);
        let updated_weight = *selector
            .strategy_weights
            .get(&OptimizationStrategy::ModernArchOptimized)
            .unwrap();

        assert_ne!(initial_weight, updated_weight);
    }
}