1#[cfg(not(feature = "no-std"))]
7use std::{
8 collections::HashMap,
9 string::ToString,
10 sync::atomic::{AtomicU64, Ordering},
11 time::{Duration, Instant},
12};
13
14#[cfg(feature = "no-std")]
15use alloc::{
16 collections::BTreeMap as HashMap,
17 format,
18 string::{String, ToString},
19 vec,
20 vec::Vec,
21};
22#[cfg(feature = "no-std")]
23use core::sync::atomic::{AtomicU64, Ordering};
24
/// Minimal replacement for `std::time::Duration`, compiled only when the
/// `no-std` feature is enabled.
///
/// The wrapped integer is treated as a count of whole microseconds by the
/// accessor and constructor methods defined for this type.
#[cfg(feature = "no-std")]
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Duration(u64);

/// Placeholder for `std::time::Instant`, compiled only when the `no-std`
/// feature is enabled. It carries no clock reading.
#[cfg(feature = "no-std")]
#[derive(Clone, Copy, Debug)]
pub struct Instant;

#[cfg(feature = "no-std")]
impl Instant {
    /// Returns the unit marker value; no time source is consulted.
    pub fn now() -> Self {
        Self
    }

    /// Always yields a zero-length [`Duration`], because this placeholder
    /// does not measure time.
    pub fn elapsed(&self) -> Duration {
        Duration(0)
    }
}
43
#[cfg(feature = "no-std")]
impl Duration {
    /// Builds a duration from a number of whole microseconds.
    pub fn from_micros(micros: u64) -> Self {
        Self(micros)
    }

    /// Builds a duration from nanoseconds.
    ///
    /// The value is stored at microsecond resolution, so any remainder
    /// below 1000 ns is discarded by the integer division.
    pub fn from_nanos(nanos: u64) -> Self {
        Self(nanos / 1000)
    }

    /// Returns the stored microsecond count.
    pub fn as_micros(&self) -> u128 {
        u128::from(self.0)
    }

    /// Returns the duration in nanoseconds (stored microseconds times 1000).
    pub fn as_nanos(&self) -> u128 {
        u128::from(self.0) * 1000
    }
}
62
#[cfg(feature = "no-std")]
impl core::ops::Add for Duration {
    type Output = Duration;

    /// Adds the two wrapped tick counts.
    fn add(self, rhs: Duration) -> Self::Output {
        let Duration(left) = self;
        let Duration(right) = rhs;
        Duration(left + right)
    }
}
71
#[cfg(feature = "no-std")]
impl core::ops::Div<u32> for Duration {
    type Output = Duration;

    /// Divides the wrapped tick count by `rhs` using integer division.
    fn div(self, rhs: u32) -> Self::Output {
        let divisor = u64::from(rhs);
        Duration(self.0 / divisor)
    }
}
80
#[cfg(feature = "no-std")]
impl core::iter::Sum for Duration {
    /// Totals the wrapped tick counts of every duration yielded by `iter`.
    fn sum<I: Iterator<Item = Duration>>(iter: I) -> Self {
        let ticks: u64 = iter.map(|Duration(value)| value).sum();
        Duration(ticks)
    }
}
87
#[cfg(feature = "no-std")]
impl<'a> core::iter::Sum<&'a Duration> for Duration {
    /// Totals the wrapped tick counts of every borrowed duration yielded by
    /// `iter`.
    fn sum<I: Iterator<Item = &'a Duration>>(iter: I) -> Self {
        let ticks: u64 = iter.map(|&Duration(value)| value).sum();
        Duration(ticks)
    }
}
94
95#[derive(Debug, Clone)]
97pub struct SimdProfiler {
98 operation_times: HashMap<String, Vec<Duration>>,
100 instruction_counts: HashMap<String, InstructionCount>,
102 cache_metrics: CacheMetrics,
104 vectorization_metrics: VectorizationMetrics,
106}
107
/// Per-operation tallies of the instruction kinds the profiler tracks.
///
/// All counters start at zero via `Default`.
#[derive(Clone, Debug, Default)]
pub struct InstructionCount {
    /// Number of SIMD instructions recorded.
    pub simd_instructions: u64,
    /// Number of scalar instructions recorded.
    pub scalar_instructions: u64,
    /// Number of memory loads recorded.
    pub memory_loads: u64,
    /// Number of memory stores recorded.
    pub memory_stores: u64,
    /// Number of branches recorded.
    pub branches: u64,
}
122
/// Snapshot of cache behaviour supplied to the profiler.
///
/// The rate fields are multiplied by 100 when the report prints them as
/// percentages, so they are expected to be fractions (presumably in
/// `0.0..=1.0`; nothing here enforces that range).
#[derive(Clone, Debug, Default)]
pub struct CacheMetrics {
    /// L1 hit rate as a fraction.
    pub l1_hit_rate: f64,
    /// L2 hit rate as a fraction.
    pub l2_hit_rate: f64,
    /// L3 hit rate as a fraction.
    pub l3_hit_rate: f64,
    /// Total number of cache misses.
    pub total_misses: u64,
    /// Memory bandwidth utilization as a fraction.
    pub bandwidth_utilization: f64,
}
137
/// Snapshot of how well an operation was vectorized.
///
/// The first three fields are multiplied by 100 when the report prints them
/// as percentages, so they are expected to be fractions.
#[derive(Clone, Debug, Default)]
pub struct VectorizationMetrics {
    /// Vectorization rate as a fraction.
    pub vectorization_rate: f64,
    /// SIMD lane utilization as a fraction.
    pub lane_utilization: f64,
    /// Throughput efficiency as a fraction.
    pub throughput_efficiency: f64,
    /// Elements handled per operation.
    pub elements_per_operation: f64,
}
150
151#[derive(Debug, Clone)]
153pub struct BottleneckAnalysis {
154 pub primary_bottleneck: BottleneckType,
156 pub limiters: Vec<(BottleneckType, f64)>,
158 pub recommendations: Vec<String>,
160}
161
/// Categories of performance limiter the profiler can report.
#[derive(Clone, Debug, PartialEq)]
pub enum BottleneckType {
    /// Used as the primary bottleneck when no other limiter is detected.
    Compute,
    /// Reported when bandwidth utilization is below the analysis threshold.
    MemoryBandwidth,
    /// Not produced by the analysis visible in this file.
    MemoryLatency,
    /// Reported when the L1 hit rate is below the analysis threshold.
    CacheMiss,
    /// Has a recommendation text, but is not produced by the analysis
    /// visible in this file.
    BranchPrediction,
    /// Reported when the vectorization rate is below the analysis threshold.
    SimdUnderutilization,
    /// Not produced by the analysis visible in this file.
    InstructionDependency,
}
180
/// Process-wide count of `OperationProfiler` values created.
static GLOBAL_OPERATION_COUNT: AtomicU64 = AtomicU64::new(0);

/// Process-wide count of SIMD instructions recorded through any
/// `OperationProfiler`.
static GLOBAL_SIMD_COUNT: AtomicU64 = AtomicU64::new(0);

/// Process-wide count of scalar instructions recorded through any
/// `OperationProfiler`.
static GLOBAL_SCALAR_COUNT: AtomicU64 = AtomicU64::new(0);
185
186impl Default for SimdProfiler {
187 fn default() -> Self {
188 Self::new()
189 }
190}
191
192impl SimdProfiler {
193 pub fn new() -> Self {
195 Self {
196 operation_times: HashMap::new(),
197 instruction_counts: HashMap::new(),
198 cache_metrics: CacheMetrics::default(),
199 vectorization_metrics: VectorizationMetrics::default(),
200 }
201 }
202
203 pub fn start_operation(&mut self, operation_name: &str) -> OperationProfiler {
205 OperationProfiler::new(operation_name.to_string())
206 }
207
208 pub fn record_time(&mut self, operation: &str, duration: Duration) {
210 self.operation_times
211 .entry(operation.to_string())
212 .or_default()
213 .push(duration);
214 }
215
216 pub fn record_instructions(&mut self, operation: &str, counts: InstructionCount) {
218 self.instruction_counts
219 .insert(operation.to_string(), counts);
220 }
221
222 pub fn update_cache_metrics(&mut self, metrics: CacheMetrics) {
224 self.cache_metrics = metrics;
225 }
226
227 pub fn update_vectorization_metrics(&mut self, metrics: VectorizationMetrics) {
229 self.vectorization_metrics = metrics;
230 }
231
232 pub fn average_time(&self, operation: &str) -> Option<Duration> {
234 self.operation_times.get(operation).map(|times| {
235 let total: Duration = times.iter().sum();
236 total / times.len() as u32
237 })
238 }
239
240 pub fn get_statistics(&self, operation: &str) -> Option<OperationStats> {
242 self.operation_times.get(operation).map(|times| {
243 let count = times.len();
244 let total: Duration = times.iter().sum();
245 let average = total / count as u32;
246
247 let mut sorted_times = times.clone();
248 sorted_times.sort();
249
250 let median = if count % 2 == 0 {
251 (sorted_times[count / 2 - 1] + sorted_times[count / 2]) / 2
252 } else {
253 sorted_times[count / 2]
254 };
255
256 let min = *sorted_times
257 .first()
258 .expect("collection should not be empty");
259 let max = *sorted_times.last().expect("collection should not be empty");
260
261 OperationStats {
262 count,
263 total,
264 average,
265 median,
266 min,
267 max,
268 std_deviation: self.calculate_std_deviation(times, average),
269 }
270 })
271 }
272
273 fn calculate_std_deviation(&self, times: &[Duration], average: Duration) -> Duration {
275 if times.len() <= 1 {
276 return Duration::from_nanos(0);
277 }
278
279 let variance: f64 = times
280 .iter()
281 .map(|&time| {
282 let diff = time.as_nanos() as f64 - average.as_nanos() as f64;
283 diff * diff
284 })
285 .sum::<f64>()
286 / times.len() as f64;
287
288 Duration::from_nanos(variance.sqrt() as u64)
289 }
290
291 pub fn analyze_bottlenecks(&self) -> BottleneckAnalysis {
293 let mut limiters = Vec::new();
294
295 if self.vectorization_metrics.vectorization_rate < 0.7 {
297 limiters.push((
298 BottleneckType::SimdUnderutilization,
299 1.0 - self.vectorization_metrics.vectorization_rate,
300 ));
301 }
302
303 if self.cache_metrics.l1_hit_rate < 0.9 {
305 limiters.push((
306 BottleneckType::CacheMiss,
307 1.0 - self.cache_metrics.l1_hit_rate,
308 ));
309 }
310
311 if self.cache_metrics.bandwidth_utilization < 0.8 {
313 limiters.push((
314 BottleneckType::MemoryBandwidth,
315 1.0 - self.cache_metrics.bandwidth_utilization,
316 ));
317 }
318
319 limiters.sort_by(|a, b| b.1.partial_cmp(&a.1).expect("operation should succeed"));
321
322 let primary_bottleneck = limiters
323 .first()
324 .map(|(t, _)| t.clone())
325 .unwrap_or(BottleneckType::Compute);
326
327 let recommendations = self.generate_recommendations(&limiters);
328
329 BottleneckAnalysis {
330 primary_bottleneck,
331 limiters,
332 recommendations,
333 }
334 }
335
336 fn generate_recommendations(&self, limiters: &[(BottleneckType, f64)]) -> Vec<String> {
338 let mut recommendations = Vec::new();
339
340 for (bottleneck_type, impact) in limiters {
341 match bottleneck_type {
342 BottleneckType::SimdUnderutilization => {
343 recommendations.push(format!(
344 "Improve SIMD utilization (current: {:.1}%): Consider wider SIMD instructions or better data layout",
345 self.vectorization_metrics.vectorization_rate * 100.0
346 ));
347 }
348 BottleneckType::CacheMiss => {
349 recommendations.push(format!(
350 "Reduce cache misses (impact: {:.1}%): Improve data locality or use cache-friendly algorithms",
351 impact * 100.0
352 ));
353 }
354 BottleneckType::MemoryBandwidth => {
355 recommendations.push(format!(
356 "Optimize memory bandwidth (utilization: {:.1}%): Use prefetching or reduce memory traffic",
357 self.cache_metrics.bandwidth_utilization * 100.0
358 ));
359 }
360 BottleneckType::BranchPrediction => {
361 recommendations.push(
362 "Reduce branching: Use branchless algorithms or improve predictability"
363 .to_string(),
364 );
365 }
366 _ => {}
367 }
368 }
369
370 recommendations
371 }
372
373 pub fn generate_report(&self) -> String {
375 let mut report = String::new();
376 report.push_str("=== SIMD Performance Analysis Report ===\n\n");
377
378 report.push_str("## Operation Performance Summary\n");
380 for operation in self.operation_times.keys() {
381 if let Some(stats) = self.get_statistics(operation) {
382 report.push_str(&format!(
383 "{}: avg={:.2}μs, min={:.2}μs, max={:.2}μs, count={}\n",
384 operation,
385 stats.average.as_micros(),
386 stats.min.as_micros(),
387 stats.max.as_micros(),
388 stats.count
389 ));
390 }
391 }
392
393 report.push_str(&format!(
395 "\n## Vectorization Efficiency\n\
396 Vectorization Rate: {:.1}%\n\
397 Lane Utilization: {:.1}%\n\
398 Throughput Efficiency: {:.1}%\n",
399 self.vectorization_metrics.vectorization_rate * 100.0,
400 self.vectorization_metrics.lane_utilization * 100.0,
401 self.vectorization_metrics.throughput_efficiency * 100.0
402 ));
403
404 report.push_str(&format!(
406 "\n## Cache Performance\n\
407 L1 Hit Rate: {:.1}%\n\
408 L2 Hit Rate: {:.1}%\n\
409 L3 Hit Rate: {:.1}%\n\
410 Bandwidth Utilization: {:.1}%\n",
411 self.cache_metrics.l1_hit_rate * 100.0,
412 self.cache_metrics.l2_hit_rate * 100.0,
413 self.cache_metrics.l3_hit_rate * 100.0,
414 self.cache_metrics.bandwidth_utilization * 100.0
415 ));
416
417 let analysis = self.analyze_bottlenecks();
419 report.push_str(&format!(
420 "\n## Bottleneck Analysis\n\
421 Primary Bottleneck: {:?}\n",
422 analysis.primary_bottleneck
423 ));
424
425 report.push_str("\n## Optimization Recommendations\n");
426 for (i, recommendation) in analysis.recommendations.iter().enumerate() {
427 report.push_str(&format!("{}. {}\n", i + 1, recommendation));
428 }
429
430 report
431 }
432}
433
/// Summary statistics over the timing samples of one operation.
#[derive(Clone, Debug)]
pub struct OperationStats {
    /// Number of samples.
    pub count: usize,
    /// Sum of all samples.
    pub total: Duration,
    /// Mean sample duration.
    pub average: Duration,
    /// Median sample duration.
    pub median: Duration,
    /// Shortest sample.
    pub min: Duration,
    /// Longest sample.
    pub max: Duration,
    /// Standard deviation of the samples.
    pub std_deviation: Duration,
}
445
446pub struct OperationProfiler {
448 #[allow(dead_code)] operation_name: String,
450 start_time: Instant,
451 instruction_count: InstructionCount,
452}
453
454impl OperationProfiler {
455 pub fn new(operation_name: String) -> Self {
457 GLOBAL_OPERATION_COUNT.fetch_add(1, Ordering::Relaxed);
458
459 Self {
460 operation_name,
461 start_time: Instant::now(),
462 instruction_count: InstructionCount::default(),
463 }
464 }
465
466 pub fn record_simd_instruction(&mut self) {
468 self.instruction_count.simd_instructions += 1;
469 GLOBAL_SIMD_COUNT.fetch_add(1, Ordering::Relaxed);
470 }
471
472 pub fn record_scalar_instruction(&mut self) {
474 self.instruction_count.scalar_instructions += 1;
475 GLOBAL_SCALAR_COUNT.fetch_add(1, Ordering::Relaxed);
476 }
477
478 pub fn record_memory_load(&mut self) {
480 self.instruction_count.memory_loads += 1;
481 }
482
483 pub fn record_memory_store(&mut self) {
484 self.instruction_count.memory_stores += 1;
485 }
486
487 pub fn finish(self) -> (Duration, InstructionCount) {
489 let duration = self.start_time.elapsed();
490 (duration, self.instruction_count)
491 }
492}
493
/// Estimates cache behaviour of a memory access pattern from fixed,
/// built-in cache parameters.
pub struct CacheAnalyzer {
    /// Capacities of the three modelled cache levels, ordered L1, L2, L3,
    /// in the same unit as `data_size` (presumably bytes).
    cache_sizes: Vec<usize>,
    /// Cache line length, in the same unit as the capacities.
    cache_line_size: usize,
}

impl Default for CacheAnalyzer {
    /// Equivalent to `CacheAnalyzer::new()`.
    fn default() -> Self {
        CacheAnalyzer::new()
    }
}

impl CacheAnalyzer {
    /// Creates an analyzer with capacities of 32 Ki, 256 Ki and 8 Mi units
    /// and a line size of 64.
    pub fn new() -> Self {
        const KI: usize = 1024;
        let cache_sizes = vec![32 * KI, 256 * KI, 8 * KI * KI];
        Self {
            cache_sizes,
            cache_line_size: 64,
        }
    }

    /// Estimates hit rates and stride efficiency for touching `data_size`
    /// units of memory with the given `stride`.
    ///
    /// The working set is `data_size` rounded up to whole cache lines. Each
    /// level's estimated hit rate is one of two fixed values, chosen by
    /// whether the working set fits in that level's capacity.
    pub fn analyze_access_pattern(&self, data_size: usize, stride: usize) -> CacheAnalysis {
        let line = self.cache_line_size;
        let cache_lines_accessed = data_size.div_ceil(line);
        let working_set_size = cache_lines_accessed * line;

        // Fixed hit rate when the working set fits in `level`, otherwise
        // the fixed miss-dominated rate.
        let estimate = |level: usize, fits_rate: f64, spills_rate: f64| -> f64 {
            if working_set_size <= self.cache_sizes[level] {
                fits_rate
            } else {
                spills_rate
            }
        };

        let l1_hit_rate = estimate(0, 0.95, 0.1);
        let l2_hit_rate = estimate(1, 0.9, 0.3);
        let l3_hit_rate = estimate(2, 0.8, 0.1);
        let stride_efficiency = self.calculate_stride_efficiency(stride);

        CacheAnalysis {
            l1_hit_rate,
            l2_hit_rate,
            l3_hit_rate,
            cache_lines_accessed,
            working_set_size,
            stride_efficiency,
        }
    }

    /// Maps a stride to a fixed efficiency factor: 1.0 up to one cache
    /// line, 0.8 up to two lines, 0.6 up to four lines, 0.3 beyond that.
    fn calculate_stride_efficiency(&self, stride: usize) -> f64 {
        let line = self.cache_line_size;
        match stride {
            s if s <= line => 1.0,
            s if s <= line * 2 => 0.8,
            s if s <= line * 4 => 0.6,
            _ => 0.3,
        }
    }
}

/// Result of `CacheAnalyzer::analyze_access_pattern`.
#[derive(Clone, Debug)]
pub struct CacheAnalysis {
    /// Estimated L1 hit rate.
    pub l1_hit_rate: f64,
    /// Estimated L2 hit rate.
    pub l2_hit_rate: f64,
    /// Estimated L3 hit rate.
    pub l3_hit_rate: f64,
    /// Number of cache lines needed to cover the data.
    pub cache_lines_accessed: usize,
    /// Data size rounded up to whole cache lines.
    pub working_set_size: usize,
    /// Efficiency factor derived from the stride.
    pub stride_efficiency: f64,
}
565
/// Stateless helper that derives vectorization ratios from operation counts.
pub struct VectorizationAnalyzer;

impl VectorizationAnalyzer {
    /// Derives vectorization metrics for one operation.
    ///
    /// * `elements_processed` - number of data elements handled.
    /// * `simd_width` - number of elements per SIMD operation.
    /// * `actual_simd_ops` - SIMD operations actually executed.
    /// * `scalar_ops` - scalar fallback operations executed.
    ///
    /// The returned ratios are:
    /// * `vectorization_rate` = `actual_simd_ops / (actual_simd_ops + scalar_ops)`
    /// * `lane_utilization` = `elements_processed / (actual_simd_ops * simd_width)`
    /// * `throughput_efficiency` = `actual_simd_ops / theoretical_simd_ops`,
    ///   where `theoretical_simd_ops` is `elements_processed` divided by
    ///   `simd_width`, rounded up.
    ///
    /// Any ratio whose denominator would be zero is reported as `0.0`. In
    /// particular a `simd_width` of zero no longer panics: it yields zero
    /// theoretical operations and zero lane utilization.
    pub fn analyze_operation(
        elements_processed: usize,
        simd_width: usize,
        actual_simd_ops: usize,
        scalar_ops: usize,
    ) -> VectorizationAnalysis {
        // `div_ceil` panics on a zero divisor, so guard it explicitly.
        let theoretical_simd_ops = if simd_width == 0 {
            0
        } else {
            elements_processed.div_ceil(simd_width)
        };

        // Denominators are formed in u128 / f64 so extreme counts cannot
        // overflow `usize`; for ordinary inputs the results are unchanged.
        let total_ops = actual_simd_ops as u128 + scalar_ops as u128;
        let vectorization_rate = if total_ops > 0 {
            actual_simd_ops as f64 / total_ops as f64
        } else {
            0.0
        };

        let lane_utilization = if actual_simd_ops > 0 && simd_width > 0 {
            elements_processed as f64 / (actual_simd_ops as f64 * simd_width as f64)
        } else {
            0.0
        };

        let throughput_efficiency = if theoretical_simd_ops > 0 {
            actual_simd_ops as f64 / theoretical_simd_ops as f64
        } else {
            0.0
        };

        VectorizationAnalysis {
            vectorization_rate,
            lane_utilization,
            throughput_efficiency,
            theoretical_simd_ops,
            actual_simd_ops,
            scalar_fallback_ops: scalar_ops,
        }
    }
}

/// Result of `VectorizationAnalyzer::analyze_operation`.
#[derive(Debug, Clone)]
pub struct VectorizationAnalysis {
    /// Share of all operations that were SIMD operations.
    pub vectorization_rate: f64,
    /// Elements processed per available SIMD lane slot.
    pub lane_utilization: f64,
    /// Actual SIMD operations relative to the theoretical count.
    pub throughput_efficiency: f64,
    /// `elements_processed / simd_width`, rounded up.
    pub theoretical_simd_ops: usize,
    /// SIMD operations actually executed (as passed in).
    pub actual_simd_ops: usize,
    /// Scalar operations executed (as passed in).
    pub scalar_fallback_ops: usize,
}
619
620pub fn get_global_stats() -> GlobalStats {
622 GlobalStats {
623 total_operations: GLOBAL_OPERATION_COUNT.load(Ordering::Relaxed),
624 total_simd_instructions: GLOBAL_SIMD_COUNT.load(Ordering::Relaxed),
625 total_scalar_instructions: GLOBAL_SCALAR_COUNT.load(Ordering::Relaxed),
626 }
627}
628
/// Snapshot of the process-wide profiling counters.
#[derive(Debug, Clone)]
pub struct GlobalStats {
    /// Number of operations counted.
    pub total_operations: u64,
    /// Number of SIMD instructions counted.
    pub total_simd_instructions: u64,
    /// Number of scalar instructions counted.
    pub total_scalar_instructions: u64,
}

impl GlobalStats {
    /// Returns the share of SIMD instructions among all SIMD and scalar
    /// instructions, or `0.0` when none have been counted.
    pub fn simd_ratio(&self) -> f64 {
        // Sum in u128: the fields are public, so two large u64 values could
        // otherwise overflow (panic in debug builds, wrap in release).
        let total = u128::from(self.total_simd_instructions)
            + u128::from(self.total_scalar_instructions);
        if total > 0 {
            self.total_simd_instructions as f64 / total as f64
        } else {
            0.0
        }
    }
}
648
// Unit tests for the profiling utilities. The module is only built for the
// std configuration, so it imports `std::time::Duration` unconditionally.
#[cfg(all(test, not(feature = "no-std")))]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Recorded samples are reflected in the average and in the statistics.
    #[test]
    fn test_profiler_basic_functionality() {
        let mut profiler = SimdProfiler::new();

        profiler.record_time("vector_add", Duration::from_micros(10));
        profiler.record_time("vector_add", Duration::from_micros(12));
        profiler.record_time("vector_add", Duration::from_micros(8));

        let avg_time = profiler
            .average_time("vector_add")
            .expect("operation should succeed");
        assert!(avg_time >= Duration::from_micros(8));
        assert!(avg_time <= Duration::from_micros(12));

        let stats = profiler
            .get_statistics("vector_add")
            .expect("operation should succeed");
        assert_eq!(stats.count, 3);
        assert_eq!(stats.min, Duration::from_micros(8));
        assert_eq!(stats.max, Duration::from_micros(12));
    }

    /// Each `record_*` call bumps exactly its own counter.
    #[test]
    fn test_operation_profiler() {
        let mut op_profiler = OperationProfiler::new("test_op".to_string());

        op_profiler.record_simd_instruction();
        op_profiler.record_simd_instruction();
        op_profiler.record_scalar_instruction();
        op_profiler.record_memory_load();

        let (duration, counts) = op_profiler.finish();

        // Always true for an unsigned duration; kept as a smoke check that
        // `finish` returns a usable value.
        assert!(duration >= Duration::from_nanos(0));
        assert_eq!(counts.simd_instructions, 2);
        assert_eq!(counts.scalar_instructions, 1);
        assert_eq!(counts.memory_loads, 1);
    }

    /// Small working sets get a high L1 estimate; large strides score low.
    #[test]
    fn test_cache_analyzer() {
        let analyzer = CacheAnalyzer::new();

        // Working set below the first cache level, stride within one line.
        let analysis = analyzer.analyze_access_pattern(16 * 1024, 4);
        assert!(analysis.l1_hit_rate > 0.9);
        assert_eq!(analysis.stride_efficiency, 1.0);

        // Stride far larger than four cache lines.
        let analysis = analyzer.analyze_access_pattern(64 * 1024, 1024);
        assert!(analysis.stride_efficiency < 0.5);
    }

    /// A mostly vectorized operation yields high ratios across the board.
    #[test]
    fn test_vectorization_analyzer() {
        // 1000 elements, width 8, 120 SIMD ops, 10 scalar ops.
        let analysis = VectorizationAnalyzer::analyze_operation(1000, 8, 120, 10);

        assert!(analysis.vectorization_rate > 0.9);
        assert!(analysis.lane_utilization > 0.95);
        assert!(analysis.throughput_efficiency > 0.9);
    }

    /// With healthy cache metrics and a low vectorization rate, SIMD
    /// underutilization is the primary bottleneck.
    #[test]
    fn test_bottleneck_analysis() {
        let mut profiler = SimdProfiler::new();

        // Above both cache-related thresholds, so no cache limiter fires.
        profiler.update_cache_metrics(CacheMetrics {
            l1_hit_rate: 0.95,
            l2_hit_rate: 0.9,
            l3_hit_rate: 0.85,
            total_misses: 100,
            bandwidth_utilization: 0.9,
        });

        // Below the vectorization threshold.
        profiler.update_vectorization_metrics(VectorizationMetrics {
            vectorization_rate: 0.3,
            lane_utilization: 0.5,
            throughput_efficiency: 0.4,
            elements_per_operation: 2.0,
        });

        let analysis = profiler.analyze_bottlenecks();
        assert_eq!(
            analysis.primary_bottleneck,
            BottleneckType::SimdUnderutilization
        );
        assert!(!analysis.recommendations.is_empty());
    }

    /// Global counters only grow, so lower bounds are asserted because
    /// other tests in the same process also touch them.
    #[test]
    fn test_global_stats() {
        let _profiler1 = OperationProfiler::new("test_op1".to_string());
        let mut profiler2 = OperationProfiler::new("test_op2".to_string());
        profiler2.record_simd_instruction();
        profiler2.record_scalar_instruction();

        let stats = get_global_stats();
        assert!(stats.total_operations >= 2);

        let simd_ratio = stats.simd_ratio();
        assert!((0.0..=1.0).contains(&simd_ratio));
    }

    /// The report contains its title and the formatted metric lines.
    #[test]
    fn test_performance_report_generation() {
        let mut profiler = SimdProfiler::new();

        profiler.record_time("test_operation", Duration::from_micros(100));
        profiler.update_vectorization_metrics(VectorizationMetrics {
            vectorization_rate: 0.85,
            lane_utilization: 0.92,
            throughput_efficiency: 0.88,
            elements_per_operation: 7.5,
        });

        let report = profiler.generate_report();
        assert!(report.contains("SIMD Performance Analysis Report"));
        assert!(report.contains("Vectorization Rate: 85.0%"));
        assert!(report.contains("Lane Utilization: 92.0%"));
    }

    /// `InstructionCount` is a plain data holder; fields round-trip.
    #[test]
    fn test_instruction_count_tracking() {
        let count = InstructionCount {
            simd_instructions: 100,
            scalar_instructions: 50,
            memory_loads: 75,
            memory_stores: 25,
            branches: 10,
        };

        assert_eq!(count.simd_instructions, 100);
        assert_eq!(count.scalar_instructions, 50);
        assert_eq!(count.memory_loads, 75);
        assert_eq!(count.memory_stores, 25);
        assert_eq!(count.branches, 10);
    }
}