#[cfg(not(feature = "no-std"))]
use std::{
collections::HashMap,
string::ToString,
sync::atomic::{AtomicU64, Ordering},
time::{Duration, Instant},
};
#[cfg(feature = "no-std")]
use alloc::{
collections::BTreeMap as HashMap,
format,
string::{String, ToString},
vec,
vec::Vec,
};
#[cfg(feature = "no-std")]
use core::sync::atomic::{AtomicU64, Ordering};
#[cfg(feature = "no-std")]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct Duration(u64); #[cfg(feature = "no-std")]
#[derive(Debug, Clone, Copy)]
pub struct Instant;
#[cfg(feature = "no-std")]
impl Instant {
pub fn now() -> Self {
Instant }
pub fn elapsed(&self) -> Duration {
Duration(0) }
}
#[cfg(feature = "no-std")]
impl Duration {
pub fn as_nanos(&self) -> u128 {
self.0 as u128 * 1000 }
pub fn from_nanos(nanos: u64) -> Self {
Duration(nanos / 1000) }
pub fn from_micros(micros: u64) -> Self {
Duration(micros) }
pub fn as_micros(&self) -> u128 {
self.0 as u128 }
}
#[cfg(feature = "no-std")]
impl core::ops::Add for Duration {
type Output = Duration;
fn add(self, rhs: Duration) -> Self::Output {
Duration(self.0 + rhs.0)
}
}
#[cfg(feature = "no-std")]
impl core::ops::Div<u32> for Duration {
type Output = Duration;
fn div(self, rhs: u32) -> Self::Output {
Duration(self.0 / rhs as u64)
}
}
#[cfg(feature = "no-std")]
impl core::iter::Sum for Duration {
fn sum<I: Iterator<Item = Duration>>(iter: I) -> Self {
Duration(iter.map(|d| d.0).sum())
}
}
#[cfg(feature = "no-std")]
impl<'a> core::iter::Sum<&'a Duration> for Duration {
fn sum<I: Iterator<Item = &'a Duration>>(iter: I) -> Self {
Duration(iter.map(|d| d.0).sum())
}
}
#[derive(Debug, Clone)]
pub struct SimdProfiler {
operation_times: HashMap<String, Vec<Duration>>,
instruction_counts: HashMap<String, InstructionCount>,
cache_metrics: CacheMetrics,
vectorization_metrics: VectorizationMetrics,
}
#[derive(Debug, Clone, Default)]
pub struct InstructionCount {
pub simd_instructions: u64,
pub scalar_instructions: u64,
pub memory_loads: u64,
pub memory_stores: u64,
pub branches: u64,
}
#[derive(Debug, Clone, Default)]
pub struct CacheMetrics {
pub l1_hit_rate: f64,
pub l2_hit_rate: f64,
pub l3_hit_rate: f64,
pub total_misses: u64,
pub bandwidth_utilization: f64,
}
#[derive(Debug, Clone, Default)]
pub struct VectorizationMetrics {
pub vectorization_rate: f64,
pub lane_utilization: f64,
pub throughput_efficiency: f64,
pub elements_per_operation: f64,
}
#[derive(Debug, Clone)]
pub struct BottleneckAnalysis {
pub primary_bottleneck: BottleneckType,
pub limiters: Vec<(BottleneckType, f64)>,
pub recommendations: Vec<String>,
}
#[derive(Debug, Clone, PartialEq)]
pub enum BottleneckType {
Compute,
MemoryBandwidth,
MemoryLatency,
CacheMiss,
BranchPrediction,
SimdUnderutilization,
InstructionDependency,
}
static GLOBAL_OPERATION_COUNT: AtomicU64 = AtomicU64::new(0);
static GLOBAL_SIMD_COUNT: AtomicU64 = AtomicU64::new(0);
static GLOBAL_SCALAR_COUNT: AtomicU64 = AtomicU64::new(0);
impl Default for SimdProfiler {
fn default() -> Self {
Self::new()
}
}
impl SimdProfiler {
pub fn new() -> Self {
Self {
operation_times: HashMap::new(),
instruction_counts: HashMap::new(),
cache_metrics: CacheMetrics::default(),
vectorization_metrics: VectorizationMetrics::default(),
}
}
pub fn start_operation(&mut self, operation_name: &str) -> OperationProfiler {
OperationProfiler::new(operation_name.to_string())
}
pub fn record_time(&mut self, operation: &str, duration: Duration) {
self.operation_times
.entry(operation.to_string())
.or_default()
.push(duration);
}
pub fn record_instructions(&mut self, operation: &str, counts: InstructionCount) {
self.instruction_counts
.insert(operation.to_string(), counts);
}
pub fn update_cache_metrics(&mut self, metrics: CacheMetrics) {
self.cache_metrics = metrics;
}
pub fn update_vectorization_metrics(&mut self, metrics: VectorizationMetrics) {
self.vectorization_metrics = metrics;
}
pub fn average_time(&self, operation: &str) -> Option<Duration> {
self.operation_times.get(operation).map(|times| {
let total: Duration = times.iter().sum();
total / times.len() as u32
})
}
pub fn get_statistics(&self, operation: &str) -> Option<OperationStats> {
self.operation_times.get(operation).map(|times| {
let count = times.len();
let total: Duration = times.iter().sum();
let average = total / count as u32;
let mut sorted_times = times.clone();
sorted_times.sort();
let median = if count % 2 == 0 {
(sorted_times[count / 2 - 1] + sorted_times[count / 2]) / 2
} else {
sorted_times[count / 2]
};
let min = *sorted_times
.first()
.expect("collection should not be empty");
let max = *sorted_times.last().expect("collection should not be empty");
OperationStats {
count,
total,
average,
median,
min,
max,
std_deviation: self.calculate_std_deviation(times, average),
}
})
}
fn calculate_std_deviation(&self, times: &[Duration], average: Duration) -> Duration {
if times.len() <= 1 {
return Duration::from_nanos(0);
}
let variance: f64 = times
.iter()
.map(|&time| {
let diff = time.as_nanos() as f64 - average.as_nanos() as f64;
diff * diff
})
.sum::<f64>()
/ times.len() as f64;
Duration::from_nanos(variance.sqrt() as u64)
}
pub fn analyze_bottlenecks(&self) -> BottleneckAnalysis {
let mut limiters = Vec::new();
if self.vectorization_metrics.vectorization_rate < 0.7 {
limiters.push((
BottleneckType::SimdUnderutilization,
1.0 - self.vectorization_metrics.vectorization_rate,
));
}
if self.cache_metrics.l1_hit_rate < 0.9 {
limiters.push((
BottleneckType::CacheMiss,
1.0 - self.cache_metrics.l1_hit_rate,
));
}
if self.cache_metrics.bandwidth_utilization < 0.8 {
limiters.push((
BottleneckType::MemoryBandwidth,
1.0 - self.cache_metrics.bandwidth_utilization,
));
}
limiters.sort_by(|a, b| b.1.partial_cmp(&a.1).expect("operation should succeed"));
let primary_bottleneck = limiters
.first()
.map(|(t, _)| t.clone())
.unwrap_or(BottleneckType::Compute);
let recommendations = self.generate_recommendations(&limiters);
BottleneckAnalysis {
primary_bottleneck,
limiters,
recommendations,
}
}
fn generate_recommendations(&self, limiters: &[(BottleneckType, f64)]) -> Vec<String> {
let mut recommendations = Vec::new();
for (bottleneck_type, impact) in limiters {
match bottleneck_type {
BottleneckType::SimdUnderutilization => {
recommendations.push(format!(
"Improve SIMD utilization (current: {:.1}%): Consider wider SIMD instructions or better data layout",
self.vectorization_metrics.vectorization_rate * 100.0
));
}
BottleneckType::CacheMiss => {
recommendations.push(format!(
"Reduce cache misses (impact: {:.1}%): Improve data locality or use cache-friendly algorithms",
impact * 100.0
));
}
BottleneckType::MemoryBandwidth => {
recommendations.push(format!(
"Optimize memory bandwidth (utilization: {:.1}%): Use prefetching or reduce memory traffic",
self.cache_metrics.bandwidth_utilization * 100.0
));
}
BottleneckType::BranchPrediction => {
recommendations.push(
"Reduce branching: Use branchless algorithms or improve predictability"
.to_string(),
);
}
_ => {}
}
}
recommendations
}
pub fn generate_report(&self) -> String {
let mut report = String::new();
report.push_str("=== SIMD Performance Analysis Report ===\n\n");
report.push_str("## Operation Performance Summary\n");
for operation in self.operation_times.keys() {
if let Some(stats) = self.get_statistics(operation) {
report.push_str(&format!(
"{}: avg={:.2}μs, min={:.2}μs, max={:.2}μs, count={}\n",
operation,
stats.average.as_micros(),
stats.min.as_micros(),
stats.max.as_micros(),
stats.count
));
}
}
report.push_str(&format!(
"\n## Vectorization Efficiency\n\
Vectorization Rate: {:.1}%\n\
Lane Utilization: {:.1}%\n\
Throughput Efficiency: {:.1}%\n",
self.vectorization_metrics.vectorization_rate * 100.0,
self.vectorization_metrics.lane_utilization * 100.0,
self.vectorization_metrics.throughput_efficiency * 100.0
));
report.push_str(&format!(
"\n## Cache Performance\n\
L1 Hit Rate: {:.1}%\n\
L2 Hit Rate: {:.1}%\n\
L3 Hit Rate: {:.1}%\n\
Bandwidth Utilization: {:.1}%\n",
self.cache_metrics.l1_hit_rate * 100.0,
self.cache_metrics.l2_hit_rate * 100.0,
self.cache_metrics.l3_hit_rate * 100.0,
self.cache_metrics.bandwidth_utilization * 100.0
));
let analysis = self.analyze_bottlenecks();
report.push_str(&format!(
"\n## Bottleneck Analysis\n\
Primary Bottleneck: {:?}\n",
analysis.primary_bottleneck
));
report.push_str("\n## Optimization Recommendations\n");
for (i, recommendation) in analysis.recommendations.iter().enumerate() {
report.push_str(&format!("{}. {}\n", i + 1, recommendation));
}
report
}
}
#[derive(Debug, Clone)]
pub struct OperationStats {
pub count: usize,
pub total: Duration,
pub average: Duration,
pub median: Duration,
pub min: Duration,
pub max: Duration,
pub std_deviation: Duration,
}
pub struct OperationProfiler {
#[allow(dead_code)] operation_name: String,
start_time: Instant,
instruction_count: InstructionCount,
}
impl OperationProfiler {
pub fn new(operation_name: String) -> Self {
GLOBAL_OPERATION_COUNT.fetch_add(1, Ordering::Relaxed);
Self {
operation_name,
start_time: Instant::now(),
instruction_count: InstructionCount::default(),
}
}
pub fn record_simd_instruction(&mut self) {
self.instruction_count.simd_instructions += 1;
GLOBAL_SIMD_COUNT.fetch_add(1, Ordering::Relaxed);
}
pub fn record_scalar_instruction(&mut self) {
self.instruction_count.scalar_instructions += 1;
GLOBAL_SCALAR_COUNT.fetch_add(1, Ordering::Relaxed);
}
pub fn record_memory_load(&mut self) {
self.instruction_count.memory_loads += 1;
}
pub fn record_memory_store(&mut self) {
self.instruction_count.memory_stores += 1;
}
pub fn finish(self) -> (Duration, InstructionCount) {
let duration = self.start_time.elapsed();
(duration, self.instruction_count)
}
}
pub struct CacheAnalyzer {
cache_sizes: Vec<usize>,
cache_line_size: usize,
}
impl Default for CacheAnalyzer {
fn default() -> Self {
Self::new()
}
}
impl CacheAnalyzer {
pub fn new() -> Self {
Self {
cache_sizes: vec![32 * 1024, 256 * 1024, 8 * 1024 * 1024], cache_line_size: 64,
}
}
pub fn analyze_access_pattern(&self, data_size: usize, stride: usize) -> CacheAnalysis {
let cache_lines_accessed = data_size.div_ceil(self.cache_line_size);
let l1_working_set = cache_lines_accessed * self.cache_line_size;
let l1_fit = l1_working_set <= self.cache_sizes[0];
let l2_fit = l1_working_set <= self.cache_sizes[1];
let l3_fit = l1_working_set <= self.cache_sizes[2];
let estimated_l1_hit_rate = if l1_fit { 0.95 } else { 0.1 };
let estimated_l2_hit_rate = if l2_fit { 0.9 } else { 0.3 };
let estimated_l3_hit_rate = if l3_fit { 0.8 } else { 0.1 };
CacheAnalysis {
l1_hit_rate: estimated_l1_hit_rate,
l2_hit_rate: estimated_l2_hit_rate,
l3_hit_rate: estimated_l3_hit_rate,
cache_lines_accessed,
working_set_size: l1_working_set,
stride_efficiency: self.calculate_stride_efficiency(stride),
}
}
fn calculate_stride_efficiency(&self, stride: usize) -> f64 {
if stride <= self.cache_line_size {
1.0 } else if stride <= self.cache_line_size * 2 {
0.8 } else if stride <= self.cache_line_size * 4 {
0.6 } else {
0.3 }
}
}
#[derive(Debug, Clone)]
pub struct CacheAnalysis {
pub l1_hit_rate: f64,
pub l2_hit_rate: f64,
pub l3_hit_rate: f64,
pub cache_lines_accessed: usize,
pub working_set_size: usize,
pub stride_efficiency: f64,
}
pub struct VectorizationAnalyzer;
impl VectorizationAnalyzer {
pub fn analyze_operation(
elements_processed: usize,
simd_width: usize,
actual_simd_ops: usize,
scalar_ops: usize,
) -> VectorizationAnalysis {
let theoretical_simd_ops = elements_processed.div_ceil(simd_width);
let total_ops = actual_simd_ops + scalar_ops;
let vectorization_rate = if total_ops > 0 {
actual_simd_ops as f64 / total_ops as f64
} else {
0.0
};
let lane_utilization = if actual_simd_ops > 0 {
elements_processed as f64 / (actual_simd_ops * simd_width) as f64
} else {
0.0
};
let throughput_efficiency = if theoretical_simd_ops > 0 {
actual_simd_ops as f64 / theoretical_simd_ops as f64
} else {
0.0
};
VectorizationAnalysis {
vectorization_rate,
lane_utilization,
throughput_efficiency,
theoretical_simd_ops,
actual_simd_ops,
scalar_fallback_ops: scalar_ops,
}
}
}
#[derive(Debug, Clone)]
pub struct VectorizationAnalysis {
pub vectorization_rate: f64,
pub lane_utilization: f64,
pub throughput_efficiency: f64,
pub theoretical_simd_ops: usize,
pub actual_simd_ops: usize,
pub scalar_fallback_ops: usize,
}
pub fn get_global_stats() -> GlobalStats {
GlobalStats {
total_operations: GLOBAL_OPERATION_COUNT.load(Ordering::Relaxed),
total_simd_instructions: GLOBAL_SIMD_COUNT.load(Ordering::Relaxed),
total_scalar_instructions: GLOBAL_SCALAR_COUNT.load(Ordering::Relaxed),
}
}
#[derive(Debug, Clone)]
pub struct GlobalStats {
pub total_operations: u64,
pub total_simd_instructions: u64,
pub total_scalar_instructions: u64,
}
impl GlobalStats {
pub fn simd_ratio(&self) -> f64 {
let total = self.total_simd_instructions + self.total_scalar_instructions;
if total > 0 {
self.total_simd_instructions as f64 / total as f64
} else {
0.0
}
}
}
#[allow(non_snake_case)]
#[cfg(all(test, not(feature = "no-std")))]
mod tests {
use super::*;
#[cfg(not(feature = "no-std"))]
use std::time::Duration;
#[cfg(feature = "no-std")]
use alloc::{
string::{String, ToString},
vec,
vec::Vec,
};
#[test]
fn test_profiler_basic_functionality() {
let mut profiler = SimdProfiler::new();
profiler.record_time("vector_add", Duration::from_micros(10));
profiler.record_time("vector_add", Duration::from_micros(12));
profiler.record_time("vector_add", Duration::from_micros(8));
let avg_time = profiler
.average_time("vector_add")
.expect("operation should succeed");
assert!(avg_time >= Duration::from_micros(8));
assert!(avg_time <= Duration::from_micros(12));
let stats = profiler
.get_statistics("vector_add")
.expect("operation should succeed");
assert_eq!(stats.count, 3);
assert_eq!(stats.min, Duration::from_micros(8));
assert_eq!(stats.max, Duration::from_micros(12));
}
#[test]
fn test_operation_profiler() {
let mut op_profiler = OperationProfiler::new("test_op".to_string());
op_profiler.record_simd_instruction();
op_profiler.record_simd_instruction();
op_profiler.record_scalar_instruction();
op_profiler.record_memory_load();
let (duration, counts) = op_profiler.finish();
assert!(duration >= Duration::from_nanos(0));
assert_eq!(counts.simd_instructions, 2);
assert_eq!(counts.scalar_instructions, 1);
assert_eq!(counts.memory_loads, 1);
}
#[test]
fn test_cache_analyzer() {
let analyzer = CacheAnalyzer::new();
let analysis = analyzer.analyze_access_pattern(16 * 1024, 4);
assert!(analysis.l1_hit_rate > 0.9);
assert_eq!(analysis.stride_efficiency, 1.0);
let analysis = analyzer.analyze_access_pattern(64 * 1024, 1024);
assert!(analysis.stride_efficiency < 0.5);
}
#[test]
fn test_vectorization_analyzer() {
let analysis = VectorizationAnalyzer::analyze_operation(
1000, 8, 120, 10, );
assert!(analysis.vectorization_rate > 0.9); assert!(analysis.lane_utilization > 0.95); assert!(analysis.throughput_efficiency > 0.9); }
#[test]
fn test_bottleneck_analysis() {
let mut profiler = SimdProfiler::new();
profiler.update_cache_metrics(CacheMetrics {
l1_hit_rate: 0.95, l2_hit_rate: 0.9,
l3_hit_rate: 0.85,
total_misses: 100,
bandwidth_utilization: 0.9, });
profiler.update_vectorization_metrics(VectorizationMetrics {
vectorization_rate: 0.3, lane_utilization: 0.5,
throughput_efficiency: 0.4,
elements_per_operation: 2.0,
});
let analysis = profiler.analyze_bottlenecks();
assert_eq!(
analysis.primary_bottleneck,
BottleneckType::SimdUnderutilization
);
assert!(!analysis.recommendations.is_empty());
}
#[test]
fn test_global_stats() {
let _profiler1 = OperationProfiler::new("test_op1".to_string());
let mut profiler2 = OperationProfiler::new("test_op2".to_string());
profiler2.record_simd_instruction();
profiler2.record_scalar_instruction();
let stats = get_global_stats();
assert!(stats.total_operations >= 2);
let simd_ratio = stats.simd_ratio();
assert!((0.0..=1.0).contains(&simd_ratio));
}
#[test]
fn test_performance_report_generation() {
let mut profiler = SimdProfiler::new();
profiler.record_time("test_operation", Duration::from_micros(100));
profiler.update_vectorization_metrics(VectorizationMetrics {
vectorization_rate: 0.85,
lane_utilization: 0.92,
throughput_efficiency: 0.88,
elements_per_operation: 7.5,
});
let report = profiler.generate_report();
assert!(report.contains("SIMD Performance Analysis Report"));
assert!(report.contains("Vectorization Rate: 85.0%"));
assert!(report.contains("Lane Utilization: 92.0%"));
}
#[test]
fn test_instruction_count_tracking() {
let count = InstructionCount {
simd_instructions: 100,
scalar_instructions: 50,
memory_loads: 75,
memory_stores: 25,
branches: 10,
};
assert_eq!(count.simd_instructions, 100);
assert_eq!(count.scalar_instructions, 50);
assert_eq!(count.memory_loads, 75);
assert_eq!(count.memory_stores, 25);
assert_eq!(count.branches, 10);
}
}