Skip to main content

sklears_simd/
profiling.rs

1//! Performance analysis and profiling tools
2//!
3//! This module provides comprehensive profiling capabilities for SIMD operations,
4//! including instruction-level profiling, cache analysis, and vectorization efficiency metrics.
5
6#[cfg(not(feature = "no-std"))]
7use std::{
8    collections::HashMap,
9    string::ToString,
10    sync::atomic::{AtomicU64, Ordering},
11    time::{Duration, Instant},
12};
13
14#[cfg(feature = "no-std")]
15use alloc::{
16    collections::BTreeMap as HashMap,
17    format,
18    string::{String, ToString},
19    vec,
20    vec::Vec,
21};
22#[cfg(feature = "no-std")]
23use core::sync::atomic::{AtomicU64, Ordering};
24
// Minimal stand-ins for `std::time::{Duration, Instant}`, compiled only when the
// `no-std` feature is enabled (same approach as in performance_hooks).
/// Mock replacement for `std::time::Duration` in `no-std` builds.
///
/// The wrapped `u64` is a whole number of microseconds, so sub-microsecond
/// precision is lost on construction from nanoseconds.
#[cfg(feature = "no-std")]
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Duration(u64);

/// Mock replacement for `std::time::Instant` in `no-std` builds.
///
/// Carries no clock reading; `elapsed` always reports zero.
#[cfg(feature = "no-std")]
#[derive(Clone, Copy, Debug)]
pub struct Instant;

#[cfg(feature = "no-std")]
impl Instant {
    /// Return a placeholder instant (no clock is consulted).
    pub fn now() -> Self {
        Self
    }

    /// Always returns a zero-length [`Duration`].
    pub fn elapsed(&self) -> Duration {
        Duration::from_micros(0)
    }
}

#[cfg(feature = "no-std")]
impl Duration {
    /// Build a duration from nanoseconds, truncating to whole microseconds.
    pub fn from_nanos(nanos: u64) -> Self {
        Self(nanos / 1000)
    }

    /// Build a duration from whole microseconds.
    pub fn from_micros(micros: u64) -> Self {
        Self(micros)
    }

    /// Stored microseconds widened to `u128`.
    pub fn as_micros(&self) -> u128 {
        u128::from(self.0)
    }

    /// Stored microseconds expressed in nanoseconds.
    pub fn as_nanos(&self) -> u128 {
        u128::from(self.0) * 1000
    }
}

#[cfg(feature = "no-std")]
impl core::ops::Add for Duration {
    type Output = Duration;

    /// Add the microsecond counts of both operands.
    fn add(self, rhs: Duration) -> Self::Output {
        let micros = self.0 + rhs.0;
        Duration(micros)
    }
}

#[cfg(feature = "no-std")]
impl core::ops::Div<u32> for Duration {
    type Output = Duration;

    /// Integer-divide the microsecond count by `rhs`.
    fn div(self, rhs: u32) -> Self::Output {
        let divisor = u64::from(rhs);
        Duration(self.0 / divisor)
    }
}

#[cfg(feature = "no-std")]
impl core::iter::Sum for Duration {
    /// Sum the microsecond counts of owned durations.
    fn sum<I: Iterator<Item = Duration>>(iter: I) -> Self {
        let micros: u64 = iter.map(|duration| duration.0).sum();
        Duration(micros)
    }
}

#[cfg(feature = "no-std")]
impl<'a> core::iter::Sum<&'a Duration> for Duration {
    /// Sum the microsecond counts of borrowed durations.
    fn sum<I: Iterator<Item = &'a Duration>>(iter: I) -> Self {
        let micros: u64 = iter.map(|duration| duration.0).sum();
        Duration(micros)
    }
}
94
95/// Performance counter for tracking SIMD operation metrics
96#[derive(Debug, Clone)]
97pub struct SimdProfiler {
98    /// Execution time measurements for different operations
99    operation_times: HashMap<String, Vec<Duration>>,
100    /// Instruction counts for SIMD vs scalar operations
101    instruction_counts: HashMap<String, InstructionCount>,
102    /// Cache performance metrics
103    cache_metrics: CacheMetrics,
104    /// Vectorization efficiency tracking
105    vectorization_metrics: VectorizationMetrics,
106}
107
/// Per-operation tally of executed instruction categories.
///
/// All counters start at zero via `Default`.
#[derive(Clone, Debug, Default)]
pub struct InstructionCount {
    /// SIMD instructions executed.
    pub simd_instructions: u64,
    /// Scalar instructions executed.
    pub scalar_instructions: u64,
    /// Memory loads performed.
    pub memory_loads: u64,
    /// Memory stores performed.
    pub memory_stores: u64,
    /// Branch instructions executed.
    pub branches: u64,
}
122
/// Snapshot of cache behaviour supplied to the profiler.
///
/// All fields default to zero.
#[derive(Clone, Debug, Default)]
pub struct CacheMetrics {
    /// Fraction of accesses served by L1 (0.0 - 1.0).
    pub l1_hit_rate: f64,
    /// Fraction of accesses served by L2 (0.0 - 1.0).
    pub l2_hit_rate: f64,
    /// Fraction of accesses served by L3 (0.0 - 1.0).
    pub l3_hit_rate: f64,
    /// Total number of cache misses.
    pub total_misses: u64,
    /// Memory bandwidth utilization.
    ///
    /// NOTE(review): previously documented as bytes/second, but
    /// `SimdProfiler::analyze_bottlenecks` compares it against `0.8` and
    /// `SimdProfiler::generate_report` prints it multiplied by 100 as a
    /// percentage, i.e. both treat it as a 0.0 - 1.0 fraction. Confirm the
    /// intended unit.
    pub bandwidth_utilization: f64,
}
137
/// Snapshot of how effectively an operation used SIMD.
///
/// All fields default to `0.0`.
#[derive(Clone, Debug, Default)]
pub struct VectorizationMetrics {
    /// Share of operations that were vectorized (0.0 - 1.0).
    pub vectorization_rate: f64,
    /// How fully the SIMD lanes were used (0.0 - 1.0).
    pub lane_utilization: f64,
    /// Ratio of theoretical to actual throughput.
    pub throughput_efficiency: f64,
    /// Elements handled by each SIMD operation.
    pub elements_per_operation: f64,
}
150
151/// Performance bottleneck identification
152#[derive(Debug, Clone)]
153pub struct BottleneckAnalysis {
154    /// Primary bottleneck type
155    pub primary_bottleneck: BottleneckType,
156    /// Performance limiters in order of impact
157    pub limiters: Vec<(BottleneckType, f64)>,
158    /// Optimization recommendations
159    pub recommendations: Vec<String>,
160}
161
/// Categories of performance bottleneck the profiler can report.
#[derive(Clone, Debug, PartialEq)]
pub enum BottleneckType {
    /// Limited by computation (CPU intensive).
    Compute,
    /// Limited by memory bandwidth.
    MemoryBandwidth,
    /// Limited by memory latency.
    MemoryLatency,
    /// Limited by cache misses.
    CacheMiss,
    /// Limited by branch prediction.
    BranchPrediction,
    /// SIMD lanes are underutilized.
    SimdUnderutilization,
    /// Limited by instruction dependency chains.
    InstructionDependency,
}
180
/// Process-wide number of `OperationProfiler` values created; incremented in
/// `OperationProfiler::new` and read by `get_global_stats`.
static GLOBAL_OPERATION_COUNT: AtomicU64 = AtomicU64::new(0);
/// Process-wide number of SIMD instructions recorded; incremented in
/// `OperationProfiler::record_simd_instruction`.
static GLOBAL_SIMD_COUNT: AtomicU64 = AtomicU64::new(0);
/// Process-wide number of scalar instructions recorded; incremented in
/// `OperationProfiler::record_scalar_instruction`.
static GLOBAL_SCALAR_COUNT: AtomicU64 = AtomicU64::new(0);
185
186impl Default for SimdProfiler {
187    fn default() -> Self {
188        Self::new()
189    }
190}
191
192impl SimdProfiler {
193    /// Create a new SIMD profiler instance
194    pub fn new() -> Self {
195        Self {
196            operation_times: HashMap::new(),
197            instruction_counts: HashMap::new(),
198            cache_metrics: CacheMetrics::default(),
199            vectorization_metrics: VectorizationMetrics::default(),
200        }
201    }
202
203    /// Start profiling a SIMD operation
204    pub fn start_operation(&mut self, operation_name: &str) -> OperationProfiler {
205        OperationProfiler::new(operation_name.to_string())
206    }
207
208    /// Record execution time for an operation
209    pub fn record_time(&mut self, operation: &str, duration: Duration) {
210        self.operation_times
211            .entry(operation.to_string())
212            .or_default()
213            .push(duration);
214    }
215
216    /// Record instruction counts for an operation
217    pub fn record_instructions(&mut self, operation: &str, counts: InstructionCount) {
218        self.instruction_counts
219            .insert(operation.to_string(), counts);
220    }
221
222    /// Update cache metrics
223    pub fn update_cache_metrics(&mut self, metrics: CacheMetrics) {
224        self.cache_metrics = metrics;
225    }
226
227    /// Update vectorization metrics
228    pub fn update_vectorization_metrics(&mut self, metrics: VectorizationMetrics) {
229        self.vectorization_metrics = metrics;
230    }
231
232    /// Get average execution time for an operation
233    pub fn average_time(&self, operation: &str) -> Option<Duration> {
234        self.operation_times.get(operation).map(|times| {
235            let total: Duration = times.iter().sum();
236            total / times.len() as u32
237        })
238    }
239
240    /// Get operation statistics
241    pub fn get_statistics(&self, operation: &str) -> Option<OperationStats> {
242        self.operation_times.get(operation).map(|times| {
243            let count = times.len();
244            let total: Duration = times.iter().sum();
245            let average = total / count as u32;
246
247            let mut sorted_times = times.clone();
248            sorted_times.sort();
249
250            let median = if count % 2 == 0 {
251                (sorted_times[count / 2 - 1] + sorted_times[count / 2]) / 2
252            } else {
253                sorted_times[count / 2]
254            };
255
256            let min = *sorted_times
257                .first()
258                .expect("collection should not be empty");
259            let max = *sorted_times.last().expect("collection should not be empty");
260
261            OperationStats {
262                count,
263                total,
264                average,
265                median,
266                min,
267                max,
268                std_deviation: self.calculate_std_deviation(times, average),
269            }
270        })
271    }
272
273    /// Calculate standard deviation of execution times
274    fn calculate_std_deviation(&self, times: &[Duration], average: Duration) -> Duration {
275        if times.len() <= 1 {
276            return Duration::from_nanos(0);
277        }
278
279        let variance: f64 = times
280            .iter()
281            .map(|&time| {
282                let diff = time.as_nanos() as f64 - average.as_nanos() as f64;
283                diff * diff
284            })
285            .sum::<f64>()
286            / times.len() as f64;
287
288        Duration::from_nanos(variance.sqrt() as u64)
289    }
290
291    /// Analyze performance bottlenecks
292    pub fn analyze_bottlenecks(&self) -> BottleneckAnalysis {
293        let mut limiters = Vec::new();
294
295        // Analyze vectorization efficiency
296        if self.vectorization_metrics.vectorization_rate < 0.7 {
297            limiters.push((
298                BottleneckType::SimdUnderutilization,
299                1.0 - self.vectorization_metrics.vectorization_rate,
300            ));
301        }
302
303        // Analyze cache performance
304        if self.cache_metrics.l1_hit_rate < 0.9 {
305            limiters.push((
306                BottleneckType::CacheMiss,
307                1.0 - self.cache_metrics.l1_hit_rate,
308            ));
309        }
310
311        // Analyze memory bandwidth utilization
312        if self.cache_metrics.bandwidth_utilization < 0.8 {
313            limiters.push((
314                BottleneckType::MemoryBandwidth,
315                1.0 - self.cache_metrics.bandwidth_utilization,
316            ));
317        }
318
319        // Sort limiters by impact
320        limiters.sort_by(|a, b| b.1.partial_cmp(&a.1).expect("operation should succeed"));
321
322        let primary_bottleneck = limiters
323            .first()
324            .map(|(t, _)| t.clone())
325            .unwrap_or(BottleneckType::Compute);
326
327        let recommendations = self.generate_recommendations(&limiters);
328
329        BottleneckAnalysis {
330            primary_bottleneck,
331            limiters,
332            recommendations,
333        }
334    }
335
336    /// Generate optimization recommendations based on bottleneck analysis
337    fn generate_recommendations(&self, limiters: &[(BottleneckType, f64)]) -> Vec<String> {
338        let mut recommendations = Vec::new();
339
340        for (bottleneck_type, impact) in limiters {
341            match bottleneck_type {
342                BottleneckType::SimdUnderutilization => {
343                    recommendations.push(format!(
344                        "Improve SIMD utilization (current: {:.1}%): Consider wider SIMD instructions or better data layout",
345                        self.vectorization_metrics.vectorization_rate * 100.0
346                    ));
347                }
348                BottleneckType::CacheMiss => {
349                    recommendations.push(format!(
350                        "Reduce cache misses (impact: {:.1}%): Improve data locality or use cache-friendly algorithms",
351                        impact * 100.0
352                    ));
353                }
354                BottleneckType::MemoryBandwidth => {
355                    recommendations.push(format!(
356                        "Optimize memory bandwidth (utilization: {:.1}%): Use prefetching or reduce memory traffic",
357                        self.cache_metrics.bandwidth_utilization * 100.0
358                    ));
359                }
360                BottleneckType::BranchPrediction => {
361                    recommendations.push(
362                        "Reduce branching: Use branchless algorithms or improve predictability"
363                            .to_string(),
364                    );
365                }
366                _ => {}
367            }
368        }
369
370        recommendations
371    }
372
373    /// Generate comprehensive performance report
374    pub fn generate_report(&self) -> String {
375        let mut report = String::new();
376        report.push_str("=== SIMD Performance Analysis Report ===\n\n");
377
378        // Operation timing summary
379        report.push_str("## Operation Performance Summary\n");
380        for operation in self.operation_times.keys() {
381            if let Some(stats) = self.get_statistics(operation) {
382                report.push_str(&format!(
383                    "{}: avg={:.2}μs, min={:.2}μs, max={:.2}μs, count={}\n",
384                    operation,
385                    stats.average.as_micros(),
386                    stats.min.as_micros(),
387                    stats.max.as_micros(),
388                    stats.count
389                ));
390            }
391        }
392
393        // Vectorization metrics
394        report.push_str(&format!(
395            "\n## Vectorization Efficiency\n\
396            Vectorization Rate: {:.1}%\n\
397            Lane Utilization: {:.1}%\n\
398            Throughput Efficiency: {:.1}%\n",
399            self.vectorization_metrics.vectorization_rate * 100.0,
400            self.vectorization_metrics.lane_utilization * 100.0,
401            self.vectorization_metrics.throughput_efficiency * 100.0
402        ));
403
404        // Cache performance
405        report.push_str(&format!(
406            "\n## Cache Performance\n\
407            L1 Hit Rate: {:.1}%\n\
408            L2 Hit Rate: {:.1}%\n\
409            L3 Hit Rate: {:.1}%\n\
410            Bandwidth Utilization: {:.1}%\n",
411            self.cache_metrics.l1_hit_rate * 100.0,
412            self.cache_metrics.l2_hit_rate * 100.0,
413            self.cache_metrics.l3_hit_rate * 100.0,
414            self.cache_metrics.bandwidth_utilization * 100.0
415        ));
416
417        // Bottleneck analysis
418        let analysis = self.analyze_bottlenecks();
419        report.push_str(&format!(
420            "\n## Bottleneck Analysis\n\
421            Primary Bottleneck: {:?}\n",
422            analysis.primary_bottleneck
423        ));
424
425        report.push_str("\n## Optimization Recommendations\n");
426        for (i, recommendation) in analysis.recommendations.iter().enumerate() {
427            report.push_str(&format!("{}. {}\n", i + 1, recommendation));
428        }
429
430        report
431    }
432}
433
/// Summary statistics over the timing samples of one operation, as produced
/// by `SimdProfiler::get_statistics`.
#[derive(Clone, Debug)]
pub struct OperationStats {
    /// Number of samples.
    pub count: usize,
    /// Sum of all samples.
    pub total: Duration,
    /// Mean sample duration.
    pub average: Duration,
    /// Median sample duration.
    pub median: Duration,
    /// Shortest sample.
    pub min: Duration,
    /// Longest sample.
    pub max: Duration,
    /// Standard deviation of the samples.
    pub std_deviation: Duration,
}
445
446/// Individual operation profiler for timing measurements
447pub struct OperationProfiler {
448    #[allow(dead_code)] // Stored for future finish() enrichment (e.g. including name in result)
449    operation_name: String,
450    start_time: Instant,
451    instruction_count: InstructionCount,
452}
453
454impl OperationProfiler {
455    /// Create a new operation profiler
456    pub fn new(operation_name: String) -> Self {
457        GLOBAL_OPERATION_COUNT.fetch_add(1, Ordering::Relaxed);
458
459        Self {
460            operation_name,
461            start_time: Instant::now(),
462            instruction_count: InstructionCount::default(),
463        }
464    }
465
466    /// Record a SIMD instruction execution
467    pub fn record_simd_instruction(&mut self) {
468        self.instruction_count.simd_instructions += 1;
469        GLOBAL_SIMD_COUNT.fetch_add(1, Ordering::Relaxed);
470    }
471
472    /// Record a scalar instruction execution
473    pub fn record_scalar_instruction(&mut self) {
474        self.instruction_count.scalar_instructions += 1;
475        GLOBAL_SCALAR_COUNT.fetch_add(1, Ordering::Relaxed);
476    }
477
478    /// Record memory operations
479    pub fn record_memory_load(&mut self) {
480        self.instruction_count.memory_loads += 1;
481    }
482
483    pub fn record_memory_store(&mut self) {
484        self.instruction_count.memory_stores += 1;
485    }
486
487    /// Finish profiling and return results
488    pub fn finish(self) -> (Duration, InstructionCount) {
489        let duration = self.start_time.elapsed();
490        (duration, self.instruction_count)
491    }
492}
493
/// Heuristic cache-behaviour estimator for a simple data access pattern.
#[derive(Debug, Clone)]
pub struct CacheAnalyzer {
    /// Cache capacities in bytes, ordered L1, L2, L3.
    cache_sizes: Vec<usize>,
    /// Cache line size in bytes.
    cache_line_size: usize,
}

impl Default for CacheAnalyzer {
    /// Equivalent to [`CacheAnalyzer::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl CacheAnalyzer {
    /// Create an analyzer with a fixed hierarchy: 32 KiB L1, 256 KiB L2,
    /// 8 MiB L3 and 64-byte cache lines.
    pub fn new() -> Self {
        Self {
            cache_sizes: vec![32 * 1024, 256 * 1024, 8 * 1024 * 1024], // L1, L2, L3
            cache_line_size: 64,
        }
    }

    /// Estimate cache behaviour for touching `data_size` bytes with the given `stride` (bytes).
    ///
    /// The working set is `data_size` rounded up to whole cache lines. Each
    /// level's hit rate is a fixed estimate chosen by whether the working set
    /// fits in that level; `stride` only affects `stride_efficiency`.
    pub fn analyze_access_pattern(&self, data_size: usize, stride: usize) -> CacheAnalysis {
        let cache_lines_accessed = data_size.div_ceil(self.cache_line_size);

        // Rounding up to whole lines can exceed `usize::MAX` for enormous
        // inputs; saturate instead of overflowing.
        let working_set_size = cache_lines_accessed.saturating_mul(self.cache_line_size);

        // Whether the working set fits in the given cache level (0 = L1).
        let fits = |level: usize| {
            self.cache_sizes
                .get(level)
                .is_some_and(|&capacity| working_set_size <= capacity)
        };

        CacheAnalysis {
            l1_hit_rate: if fits(0) { 0.95 } else { 0.1 },
            l2_hit_rate: if fits(1) { 0.9 } else { 0.3 },
            l3_hit_rate: if fits(2) { 0.8 } else { 0.1 },
            cache_lines_accessed,
            working_set_size,
            stride_efficiency: self.calculate_stride_efficiency(stride),
        }
    }

    /// Score how cache-friendly `stride` is relative to the cache line size:
    /// 1.0 up to one line, 0.8 up to two, 0.6 up to four, 0.3 beyond.
    fn calculate_stride_efficiency(&self, stride: usize) -> f64 {
        let line = self.cache_line_size;
        if stride <= line {
            1.0 // Perfect locality
        } else if stride <= line * 2 {
            0.8 // Good locality
        } else if stride <= line * 4 {
            0.6 // Moderate locality
        } else {
            0.3 // Poor locality
        }
    }
}

/// Output of [`CacheAnalyzer::analyze_access_pattern`].
#[derive(Debug, Clone)]
pub struct CacheAnalysis {
    /// Estimated L1 hit rate.
    pub l1_hit_rate: f64,
    /// Estimated L2 hit rate.
    pub l2_hit_rate: f64,
    /// Estimated L3 hit rate.
    pub l3_hit_rate: f64,
    /// Number of cache lines covered by the data.
    pub cache_lines_accessed: usize,
    /// Data size rounded up to whole cache lines, in bytes.
    pub working_set_size: usize,
    /// Stride locality score (0.3 - 1.0).
    pub stride_efficiency: f64,
}
565
/// Stateless helper that derives vectorization efficiency figures from operation counts.
#[derive(Debug, Clone, Copy)]
pub struct VectorizationAnalyzer;

impl VectorizationAnalyzer {
    /// Compute vectorization figures for one operation.
    ///
    /// * `elements_processed` - number of elements handled in total.
    /// * `simd_width` - elements per SIMD operation; `0` is treated as "no
    ///   SIMD available" and yields zero theoretical ops and zero lane utilization.
    /// * `actual_simd_ops` - SIMD operations actually issued.
    /// * `scalar_ops` - scalar fallback operations issued.
    ///
    /// Every ratio is `0.0` when its denominator is zero.
    pub fn analyze_operation(
        elements_processed: usize,
        simd_width: usize,
        actual_simd_ops: usize,
        scalar_ops: usize,
    ) -> VectorizationAnalysis {
        // `div_ceil` panics on a zero divisor, so guard it explicitly.
        let theoretical_simd_ops = if simd_width > 0 {
            elements_processed.div_ceil(simd_width)
        } else {
            0
        };

        // Saturate so absurdly large counts cannot overflow.
        let total_ops = actual_simd_ops.saturating_add(scalar_ops);
        let lane_capacity = actual_simd_ops.saturating_mul(simd_width);

        // Ratio that is defined as 0.0 for an empty denominator.
        let ratio = |numerator: usize, denominator: usize| -> f64 {
            if denominator > 0 {
                numerator as f64 / denominator as f64
            } else {
                0.0
            }
        };

        VectorizationAnalysis {
            vectorization_rate: ratio(actual_simd_ops, total_ops),
            lane_utilization: ratio(elements_processed, lane_capacity),
            throughput_efficiency: ratio(actual_simd_ops, theoretical_simd_ops),
            theoretical_simd_ops,
            actual_simd_ops,
            scalar_fallback_ops: scalar_ops,
        }
    }
}

/// Output of [`VectorizationAnalyzer::analyze_operation`].
#[derive(Debug, Clone)]
pub struct VectorizationAnalysis {
    /// SIMD ops as a share of all ops (SIMD + scalar).
    pub vectorization_rate: f64,
    /// Elements processed divided by the lane capacity of the issued SIMD ops.
    pub lane_utilization: f64,
    /// Actual SIMD ops divided by the theoretical SIMD op count.
    pub throughput_efficiency: f64,
    /// `ceil(elements_processed / simd_width)`, or 0 for a zero width.
    pub theoretical_simd_ops: usize,
    /// SIMD operations actually issued.
    pub actual_simd_ops: usize,
    /// Scalar fallback operations issued.
    pub scalar_fallback_ops: usize,
}
619
620/// Global profiling statistics
621pub fn get_global_stats() -> GlobalStats {
622    GlobalStats {
623        total_operations: GLOBAL_OPERATION_COUNT.load(Ordering::Relaxed),
624        total_simd_instructions: GLOBAL_SIMD_COUNT.load(Ordering::Relaxed),
625        total_scalar_instructions: GLOBAL_SCALAR_COUNT.load(Ordering::Relaxed),
626    }
627}
628
/// Snapshot of the process-wide profiling counters.
#[derive(Debug, Clone)]
pub struct GlobalStats {
    /// Number of operation profilers created.
    pub total_operations: u64,
    /// Number of SIMD instructions recorded.
    pub total_simd_instructions: u64,
    /// Number of scalar instructions recorded.
    pub total_scalar_instructions: u64,
}

impl GlobalStats {
    /// Share of recorded instructions that were SIMD, in `0.0..=1.0`.
    ///
    /// Returns `0.0` when no instructions have been recorded.
    pub fn simd_ratio(&self) -> f64 {
        // Widen before adding so two large `u64` counters cannot overflow.
        let simd = u128::from(self.total_simd_instructions);
        let total = simd + u128::from(self.total_scalar_instructions);
        if total == 0 {
            0.0
        } else {
            simd as f64 / total as f64
        }
    }
}
648
// Unit tests for the std build. The module is compiled only without the
// `no-std` feature, so it uses `std::time::Duration` directly.
#[cfg(all(test, not(feature = "no-std")))]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Recorded samples feed `average_time` and `get_statistics`.
    #[test]
    fn test_profiler_basic_functionality() {
        let mut profiler = SimdProfiler::new();

        // Record some operation times
        profiler.record_time("vector_add", Duration::from_micros(10));
        profiler.record_time("vector_add", Duration::from_micros(12));
        profiler.record_time("vector_add", Duration::from_micros(8));

        let avg_time = profiler
            .average_time("vector_add")
            .expect("samples were recorded for vector_add");
        assert_eq!(avg_time, Duration::from_micros(10));

        let stats = profiler
            .get_statistics("vector_add")
            .expect("samples were recorded for vector_add");
        assert_eq!(stats.count, 3);
        assert_eq!(stats.total, Duration::from_micros(30));
        assert_eq!(stats.average, Duration::from_micros(10));
        assert_eq!(stats.median, Duration::from_micros(10));
        assert_eq!(stats.min, Duration::from_micros(8));
        assert_eq!(stats.max, Duration::from_micros(12));

        // Unknown operations have no statistics.
        assert!(profiler.average_time("missing").is_none());
        assert!(profiler.get_statistics("missing").is_none());
    }

    /// `OperationProfiler` tallies each recorded instruction category.
    #[test]
    fn test_operation_profiler() {
        let mut op_profiler = OperationProfiler::new("test_op".to_string());

        op_profiler.record_simd_instruction();
        op_profiler.record_simd_instruction();
        op_profiler.record_scalar_instruction();
        op_profiler.record_memory_load();

        let (duration, counts) = op_profiler.finish();

        assert!(duration >= Duration::from_nanos(0));
        assert_eq!(counts.simd_instructions, 2);
        assert_eq!(counts.scalar_instructions, 1);
        assert_eq!(counts.memory_loads, 1);
        assert_eq!(counts.memory_stores, 0);
    }

    /// Working-set size drives hit rates; stride drives the locality score.
    #[test]
    fn test_cache_analyzer() {
        let analyzer = CacheAnalyzer::new();

        // Small data should fit in L1 cache
        let analysis = analyzer.analyze_access_pattern(16 * 1024, 4);
        assert!(analysis.l1_hit_rate > 0.9);
        assert_eq!(analysis.stride_efficiency, 1.0);
        assert_eq!(analysis.cache_lines_accessed, 256);
        assert_eq!(analysis.working_set_size, 16 * 1024);

        // Large stride should have poor efficiency
        let analysis = analyzer.analyze_access_pattern(64 * 1024, 1024);
        assert!(analysis.stride_efficiency < 0.5);
        assert!(analysis.l1_hit_rate < 0.5); // 64 KiB does not fit in the 32 KiB L1
    }

    /// A mostly-vectorized operation scores high on every ratio.
    #[test]
    fn test_vectorization_analyzer() {
        let analysis = VectorizationAnalyzer::analyze_operation(
            1000, // elements processed
            8,    // SIMD width
            120,  // actual SIMD ops (should be 125 theoretical)
            10,   // scalar ops
        );

        assert_eq!(analysis.theoretical_simd_ops, 125);
        assert!(analysis.vectorization_rate > 0.9); // High vectorization
        assert!(analysis.lane_utilization > 0.95); // Good lane utilization
        assert!(analysis.throughput_efficiency > 0.9); // Good efficiency
    }

    /// With healthy cache metrics, poor vectorization is the primary bottleneck.
    #[test]
    fn test_bottleneck_analysis() {
        let mut profiler = SimdProfiler::new();

        // Set up good cache metrics so SIMD becomes the primary bottleneck
        profiler.update_cache_metrics(CacheMetrics {
            l1_hit_rate: 0.95, // Good cache performance
            l2_hit_rate: 0.9,
            l3_hit_rate: 0.85,
            total_misses: 100,
            bandwidth_utilization: 0.9, // Good bandwidth utilization
        });

        // Set up poor vectorization metrics
        profiler.update_vectorization_metrics(VectorizationMetrics {
            vectorization_rate: 0.3, // Poor vectorization
            lane_utilization: 0.5,
            throughput_efficiency: 0.4,
            elements_per_operation: 2.0,
        });

        let analysis = profiler.analyze_bottlenecks();
        assert_eq!(
            analysis.primary_bottleneck,
            BottleneckType::SimdUnderutilization
        );
        assert_eq!(analysis.limiters.len(), 1);
        assert_eq!(analysis.recommendations.len(), 1);
    }

    /// Global counters only grow, so lower bounds are the strongest safe
    /// assertions when tests run in parallel.
    #[test]
    fn test_global_stats() {
        // Create some operations to test with
        let _profiler1 = OperationProfiler::new("test_op1".to_string());
        let mut profiler2 = OperationProfiler::new("test_op2".to_string());
        profiler2.record_simd_instruction();
        profiler2.record_scalar_instruction();

        let stats = get_global_stats();
        assert!(stats.total_operations >= 2); // At least the operations we just created
        assert!(stats.total_simd_instructions >= 1);
        assert!(stats.total_scalar_instructions >= 1);

        let simd_ratio = stats.simd_ratio();
        assert!((0.0..=1.0).contains(&simd_ratio));
    }

    /// The report contains its header, the operation name and formatted metrics.
    #[test]
    fn test_performance_report_generation() {
        let mut profiler = SimdProfiler::new();

        profiler.record_time("test_operation", Duration::from_micros(100));
        profiler.update_vectorization_metrics(VectorizationMetrics {
            vectorization_rate: 0.85,
            lane_utilization: 0.92,
            throughput_efficiency: 0.88,
            elements_per_operation: 7.5,
        });

        let report = profiler.generate_report();
        assert!(report.contains("SIMD Performance Analysis Report"));
        assert!(report.contains("test_operation: avg="));
        assert!(report.contains("Vectorization Rate: 85.0%"));
        assert!(report.contains("Lane Utilization: 92.0%"));
        assert!(report.contains("Throughput Efficiency: 88.0%"));
    }

    /// `InstructionCount` is a plain data holder; fields round-trip unchanged.
    #[test]
    fn test_instruction_count_tracking() {
        let count = InstructionCount {
            simd_instructions: 100,
            scalar_instructions: 50,
            memory_loads: 75,
            memory_stores: 25,
            branches: 10,
        };

        // Verify all fields are tracked correctly
        assert_eq!(count.simd_instructions, 100);
        assert_eq!(count.scalar_instructions, 50);
        assert_eq!(count.memory_loads, 75);
        assert_eq!(count.memory_stores, 25);
        assert_eq!(count.branches, 10);
    }
}