use crate::TorshResult;
use std::collections::{HashMap, VecDeque};
use std::time::{Duration, Instant};
use torsh_tensor::Tensor;
#[derive(Debug, Clone)]
pub struct QuantizationProfiler {
operation_metrics: HashMap<String, OperationMetrics>,
memory_tracker: MemoryTracker,
config: ProfilerConfig,
session: ProfilerSession,
}
#[derive(Debug, Clone)]
pub struct ProfilerConfig {
pub max_samples: usize,
pub track_memory: bool,
pub detect_regressions: bool,
pub regression_threshold: f64,
pub enable_alerts: bool,
pub min_profile_time_us: u64,
}
#[derive(Debug, Clone)]
pub struct OperationMetrics {
pub name: String,
pub execution_times: VecDeque<u64>,
pub throughput: VecDeque<f64>,
pub memory_usage: VecDeque<usize>,
pub element_counts: VecDeque<usize>,
pub tensor_shapes: VecDeque<Vec<usize>>,
pub stats: PerformanceStats,
}
#[derive(Debug, Clone)]
pub struct PerformanceStats {
pub avg_time_us: f64,
pub median_time_us: f64,
pub std_dev_time_us: f64,
pub p95_time_us: f64,
pub p99_time_us: f64,
pub avg_throughput: f64,
pub peak_throughput: f64,
pub avg_memory_usage: f64,
pub sample_count: usize,
}
#[derive(Debug, Clone)]
pub struct MemoryTracker {
current_usage: HashMap<String, usize>,
peak_usage: usize,
usage_history: VecDeque<MemorySnapshot>,
}
#[derive(Debug, Clone)]
pub struct MemorySnapshot {
pub timestamp: Instant,
pub total_usage: usize,
pub breakdown: HashMap<String, usize>,
}
#[derive(Debug, Clone)]
pub struct ProfilerSession {
pub start_time: Instant,
pub total_operations: usize,
pub config_snapshot: String,
pub alerts: Vec<PerformanceAlert>,
}
#[derive(Debug, Clone)]
pub enum PerformanceAlert {
Regression {
operation: String,
previous_avg: f64,
current_avg: f64,
regression_percent: f64,
},
MemorySpike {
operation: String,
current_usage: usize,
previous_avg: usize,
spike_percent: f64,
},
SlowOperation {
operation: String,
execution_time_us: u64,
expected_time_us: u64,
},
}
#[derive(Debug, Clone)]
pub struct ProfilingResult {
pub operation: String,
pub execution_time_us: u64,
pub throughput: f64,
pub memory_usage: usize,
pub element_count: usize,
pub tensor_shape: Vec<usize>,
}
#[derive(Debug)]
pub struct PerformanceReport {
pub timestamp: Instant,
pub session_duration: Duration,
pub operation_metrics: HashMap<String, OperationMetrics>,
pub memory_analysis: MemoryAnalysis,
pub bottlenecks: Vec<Bottleneck>,
pub recommendations: Vec<String>,
pub alerts: Vec<PerformanceAlert>,
}
#[derive(Debug)]
pub struct MemoryAnalysis {
pub peak_usage: usize,
pub avg_usage: f64,
pub efficiency_score: f64,
pub trends: Vec<String>,
}
#[derive(Debug)]
pub struct Bottleneck {
pub operation: String,
pub severity: f64,
pub description: String,
pub suggestion: String,
}
impl Default for ProfilerConfig {
fn default() -> Self {
Self {
max_samples: 1000,
track_memory: true,
detect_regressions: true,
regression_threshold: 10.0, enable_alerts: true,
min_profile_time_us: 100, }
}
}
impl QuantizationProfiler {
pub fn new(config: ProfilerConfig) -> Self {
Self {
operation_metrics: HashMap::new(),
memory_tracker: MemoryTracker::new(),
config,
session: ProfilerSession {
start_time: Instant::now(),
total_operations: 0,
config_snapshot: "default".to_string(),
alerts: Vec::new(),
},
}
}
pub fn profile_operation<F, R>(
&mut self,
operation_name: &str,
tensor: &Tensor,
operation: F,
) -> TorshResult<(R, ProfilingResult)>
where
F: FnOnce(&Tensor) -> TorshResult<R>,
{
let start_memory = if self.config.track_memory {
self.estimate_memory_usage(tensor)
} else {
0
};
let start_time = Instant::now();
let result = operation(tensor)?;
let execution_time = start_time.elapsed();
let execution_time_us = execution_time.as_micros() as u64;
if execution_time_us < self.config.min_profile_time_us {
return Ok((
result,
ProfilingResult {
operation: operation_name.to_string(),
execution_time_us,
throughput: 0.0,
memory_usage: start_memory,
element_count: tensor.numel(),
tensor_shape: tensor.shape().dims().to_vec(),
},
));
}
let element_count = tensor.numel();
let throughput = if execution_time_us > 0 {
element_count as f64 / (execution_time_us as f64 / 1_000_000.0)
} else {
0.0
};
let profiling_result = ProfilingResult {
operation: operation_name.to_string(),
execution_time_us,
throughput,
memory_usage: start_memory,
element_count,
tensor_shape: tensor.shape().dims().to_vec(),
};
self.record_metrics(operation_name, &profiling_result);
if self.config.enable_alerts {
self.check_performance_alerts(operation_name, &profiling_result);
}
self.session.total_operations += 1;
Ok((result, profiling_result))
}
fn record_metrics(&mut self, operation_name: &str, result: &ProfilingResult) {
let metrics = self
.operation_metrics
.entry(operation_name.to_string())
.or_insert_with(|| OperationMetrics::new(operation_name));
metrics.execution_times.push_back(result.execution_time_us);
metrics.throughput.push_back(result.throughput);
metrics.memory_usage.push_back(result.memory_usage);
metrics.element_counts.push_back(result.element_count);
metrics.tensor_shapes.push_back(result.tensor_shape.clone());
if metrics.execution_times.len() > self.config.max_samples {
metrics.execution_times.pop_front();
metrics.throughput.pop_front();
metrics.memory_usage.pop_front();
metrics.element_counts.pop_front();
metrics.tensor_shapes.pop_front();
}
metrics.update_statistics();
if self.config.track_memory {
self.memory_tracker
.record_usage(operation_name, result.memory_usage);
}
}
fn check_performance_alerts(&mut self, operation_name: &str, result: &ProfilingResult) {
if let Some(metrics) = self.operation_metrics.get(operation_name) {
if self.config.detect_regressions && metrics.execution_times.len() > 10 {
let recent_avg =
metrics.execution_times.iter().rev().take(5).sum::<u64>() as f64 / 5.0;
let historical_avg = metrics.stats.avg_time_us;
if recent_avg > historical_avg * (1.0 + self.config.regression_threshold / 100.0) {
let regression_percent =
((recent_avg - historical_avg) / historical_avg) * 100.0;
self.session.alerts.push(PerformanceAlert::Regression {
operation: operation_name.to_string(),
previous_avg: historical_avg,
current_avg: recent_avg,
regression_percent,
});
}
}
if result.memory_usage > metrics.stats.avg_memory_usage as usize * 2 {
let spike_percent = ((result.memory_usage as f64 - metrics.stats.avg_memory_usage)
/ metrics.stats.avg_memory_usage)
* 100.0;
self.session.alerts.push(PerformanceAlert::MemorySpike {
operation: operation_name.to_string(),
current_usage: result.memory_usage,
previous_avg: metrics.stats.avg_memory_usage as usize,
spike_percent,
});
}
if result.execution_time_us > metrics.stats.p95_time_us as u64 * 2 {
self.session.alerts.push(PerformanceAlert::SlowOperation {
operation: operation_name.to_string(),
execution_time_us: result.execution_time_us,
expected_time_us: metrics.stats.avg_time_us as u64,
});
}
}
}
fn estimate_memory_usage(&self, tensor: &Tensor) -> usize {
let element_size = match tensor.dtype() {
torsh_core::DType::F32 => 4,
torsh_core::DType::F64 => 8,
torsh_core::DType::I32 => 4,
torsh_core::DType::I64 => 8,
_ => 4, };
tensor.numel() * element_size
}
pub fn generate_report(&self) -> PerformanceReport {
let session_duration = self.session.start_time.elapsed();
let memory_analysis = self.analyze_memory_usage();
let bottlenecks = self.identify_bottlenecks();
let recommendations = self.generate_recommendations(&bottlenecks);
PerformanceReport {
timestamp: Instant::now(),
session_duration,
operation_metrics: self.operation_metrics.clone(),
memory_analysis,
bottlenecks,
recommendations,
alerts: self.session.alerts.clone(),
}
}
fn analyze_memory_usage(&self) -> MemoryAnalysis {
let total_usage: usize = self.memory_tracker.current_usage.values().sum();
let avg_usage = if self.memory_tracker.usage_history.is_empty() {
total_usage as f64
} else {
self.memory_tracker
.usage_history
.iter()
.map(|s| s.total_usage as f64)
.sum::<f64>()
/ self.memory_tracker.usage_history.len() as f64
};
let efficiency_score = if self.memory_tracker.peak_usage > 0 {
((self.memory_tracker.peak_usage as f64 - avg_usage)
/ self.memory_tracker.peak_usage as f64)
* 100.0
} else {
100.0
};
MemoryAnalysis {
peak_usage: self.memory_tracker.peak_usage,
avg_usage,
efficiency_score,
trends: vec!["Memory usage is stable".to_string()], }
}
fn identify_bottlenecks(&self) -> Vec<Bottleneck> {
let mut bottlenecks = Vec::new();
for (op_name, metrics) in &self.operation_metrics {
if metrics.stats.std_dev_time_us > metrics.stats.avg_time_us * 0.5 {
bottlenecks.push(Bottleneck {
operation: op_name.clone(),
severity: 75.0,
description: "High variance in execution times detected".to_string(),
suggestion: "Consider optimizing for consistent performance".to_string(),
});
}
if metrics.stats.avg_throughput < 1_000_000.0 {
bottlenecks.push(Bottleneck {
operation: op_name.clone(),
severity: 60.0,
description: "Low throughput detected".to_string(),
suggestion: "Consider hardware acceleration or algorithm optimization"
.to_string(),
});
}
}
bottlenecks
}
fn generate_recommendations(&self, bottlenecks: &[Bottleneck]) -> Vec<String> {
let mut recommendations = Vec::new();
if !bottlenecks.is_empty() {
recommendations
.push("Consider enabling hardware acceleration for better performance".to_string());
}
if self.session.total_operations > 1000 {
recommendations.push(
"Large number of operations detected - consider batch processing".to_string(),
);
}
if self.memory_tracker.peak_usage > 1_000_000_000 {
recommendations.push(
"High memory usage detected - consider streaming or chunked processing".to_string(),
);
}
if recommendations.is_empty() {
recommendations
.push("Performance looks optimal - no specific recommendations".to_string());
}
recommendations
}
pub fn get_session_stats(&self) -> HashMap<String, f64> {
let mut stats = HashMap::new();
stats.insert(
"total_operations".to_string(),
self.session.total_operations as f64,
);
stats.insert(
"session_duration_seconds".to_string(),
self.session.start_time.elapsed().as_secs_f64(),
);
stats.insert(
"operations_per_second".to_string(),
self.session.total_operations as f64 / self.session.start_time.elapsed().as_secs_f64(),
);
stats.insert(
"alerts_generated".to_string(),
self.session.alerts.len() as f64,
);
if let Some(fastest_op) = self.operation_metrics.values().min_by(|a, b| {
a.stats
.avg_time_us
.partial_cmp(&b.stats.avg_time_us)
.unwrap()
}) {
stats.insert(
"fastest_avg_time_us".to_string(),
fastest_op.stats.avg_time_us,
);
}
if let Some(slowest_op) = self.operation_metrics.values().max_by(|a, b| {
a.stats
.avg_time_us
.partial_cmp(&b.stats.avg_time_us)
.unwrap()
}) {
stats.insert(
"slowest_avg_time_us".to_string(),
slowest_op.stats.avg_time_us,
);
}
stats
}
pub fn reset(&mut self) {
self.operation_metrics.clear();
self.memory_tracker = MemoryTracker::new();
self.session = ProfilerSession {
start_time: Instant::now(),
total_operations: 0,
config_snapshot: format!("{:?}", self.config),
alerts: Vec::new(),
};
}
}
impl OperationMetrics {
fn new(name: &str) -> Self {
Self {
name: name.to_string(),
execution_times: VecDeque::new(),
throughput: VecDeque::new(),
memory_usage: VecDeque::new(),
element_counts: VecDeque::new(),
tensor_shapes: VecDeque::new(),
stats: PerformanceStats::default(),
}
}
fn update_statistics(&mut self) {
if self.execution_times.is_empty() {
return;
}
let times: Vec<f64> = self.execution_times.iter().map(|&t| t as f64).collect();
let throughputs: Vec<f64> = self.throughput.iter().copied().collect();
let memory_usages: Vec<f64> = self.memory_usage.iter().map(|&m| m as f64).collect();
let avg_time = times.iter().sum::<f64>() / times.len() as f64;
let mut sorted_times = times.clone();
sorted_times.sort_by(|a, b| a.partial_cmp(b).unwrap());
let median_time = sorted_times[sorted_times.len() / 2];
let mean = avg_time;
let variance = times.iter().map(|t| (t - mean).powi(2)).sum::<f64>() / times.len() as f64;
let std_dev_time = variance.sqrt();
let p95_time = Self::percentile(×, 95.0);
let p99_time = Self::percentile(×, 99.0);
self.stats = PerformanceStats {
avg_time_us: avg_time,
median_time_us: median_time,
std_dev_time_us: std_dev_time,
p95_time_us: p95_time,
p99_time_us: p99_time,
avg_throughput: throughputs.iter().sum::<f64>() / throughputs.len() as f64,
peak_throughput: throughputs.iter().copied().fold(0.0, f64::max),
avg_memory_usage: memory_usages.iter().sum::<f64>() / memory_usages.len() as f64,
sample_count: times.len(),
};
}
fn percentile(values: &[f64], percentile: f64) -> f64 {
if values.is_empty() {
return 0.0;
}
let mut sorted = values.to_vec();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
let index = (percentile / 100.0) * (sorted.len() - 1) as f64;
let lower = index.floor() as usize;
let upper = index.ceil() as usize;
if lower == upper {
sorted[lower]
} else {
let weight = index - lower as f64;
sorted[lower] * (1.0 - weight) + sorted[upper] * weight
}
}
}
impl Default for PerformanceStats {
fn default() -> Self {
Self {
avg_time_us: 0.0,
median_time_us: 0.0,
std_dev_time_us: 0.0,
p95_time_us: 0.0,
p99_time_us: 0.0,
avg_throughput: 0.0,
peak_throughput: 0.0,
avg_memory_usage: 0.0,
sample_count: 0,
}
}
}
impl MemoryTracker {
fn new() -> Self {
Self {
current_usage: HashMap::new(),
peak_usage: 0,
usage_history: VecDeque::new(),
}
}
fn record_usage(&mut self, operation: &str, usage: usize) {
self.current_usage.insert(operation.to_string(), usage);
let total_usage: usize = self.current_usage.values().sum();
if total_usage > self.peak_usage {
self.peak_usage = total_usage;
}
self.usage_history.push_back(MemorySnapshot {
timestamp: Instant::now(),
total_usage,
breakdown: self.current_usage.clone(),
});
if self.usage_history.len() > 1000 {
self.usage_history.pop_front();
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use torsh_tensor::creation::tensor_1d;
#[test]
fn test_profiler_creation() {
let config = ProfilerConfig::default();
let profiler = QuantizationProfiler::new(config);
assert_eq!(profiler.operation_metrics.len(), 0);
assert_eq!(profiler.session.total_operations, 0);
}
#[test]
fn test_profile_operation() {
let mut profiler = QuantizationProfiler::new(ProfilerConfig::default());
let tensor = tensor_1d(&[1.0, 2.0, 3.0, 4.0]).unwrap();
let result = profiler.profile_operation("test_op", &tensor, |t| {
std::thread::sleep(std::time::Duration::from_micros(200));
Ok(t.clone())
});
assert!(result.is_ok());
let (_, profiling_result) = result.unwrap();
assert_eq!(profiling_result.operation, "test_op");
assert!(profiling_result.execution_time_us >= 200);
assert_eq!(profiling_result.element_count, 4);
}
#[test]
fn test_performance_stats_calculation() {
let mut metrics = OperationMetrics::new("test");
metrics.execution_times.extend([100, 200, 150, 180, 120]);
metrics
.throughput
.extend([1000.0, 2000.0, 1500.0, 1800.0, 1200.0]);
metrics.memory_usage.extend([1000, 2000, 1500, 1800, 1200]);
metrics.update_statistics();
assert_eq!(metrics.stats.avg_time_us, 150.0);
assert_eq!(metrics.stats.sample_count, 5);
assert!(metrics.stats.std_dev_time_us > 0.0);
}
#[test]
fn test_memory_tracking() {
let mut tracker = MemoryTracker::new();
tracker.record_usage("op1", 1000);
tracker.record_usage("op2", 2000);
assert_eq!(tracker.peak_usage, 3000);
assert_eq!(tracker.usage_history.len(), 2);
}
#[test]
fn test_bottleneck_detection() {
let mut profiler = QuantizationProfiler::new(ProfilerConfig::default());
let metrics = OperationMetrics {
name: "slow_op".to_string(),
execution_times: VecDeque::from([100, 500, 200, 600, 150]),
throughput: VecDeque::from([500.0, 100.0, 400.0, 90.0, 450.0]),
memory_usage: VecDeque::new(),
element_counts: VecDeque::new(),
tensor_shapes: VecDeque::new(),
stats: PerformanceStats {
avg_time_us: 310.0,
std_dev_time_us: 220.0, avg_throughput: 308.0, ..PerformanceStats::default()
},
};
profiler
.operation_metrics
.insert("slow_op".to_string(), metrics);
let bottlenecks = profiler.identify_bottlenecks();
assert!(!bottlenecks.is_empty());
}
#[test]
fn test_session_stats() {
let mut profiler = QuantizationProfiler::new(ProfilerConfig::default());
profiler.session.total_operations = 100;
let stats = profiler.get_session_stats();
assert_eq!(stats.get("total_operations"), Some(&100.0));
assert!(stats.contains_key("session_duration_seconds"));
}
#[test]
fn test_profiler_reset() {
let mut profiler = QuantizationProfiler::new(ProfilerConfig::default());
profiler.session.total_operations = 100;
profiler
.operation_metrics
.insert("test".to_string(), OperationMetrics::new("test"));
profiler.reset();
assert_eq!(profiler.session.total_operations, 0);
assert!(profiler.operation_metrics.is_empty());
}
#[test]
fn test_percentile_calculation() {
let values = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
assert_eq!(OperationMetrics::percentile(&values, 50.0), 5.5);
assert!((OperationMetrics::percentile(&values, 95.0) - 9.55).abs() < 0.01);
assert_eq!(OperationMetrics::percentile(&values, 0.0), 1.0);
assert_eq!(OperationMetrics::percentile(&values, 100.0), 10.0);
}
#[test]
fn test_performance_alert_regression() {
let mut profiler = QuantizationProfiler::new(ProfilerConfig {
regression_threshold: 20.0,
..ProfilerConfig::default()
});
let mut metrics = OperationMetrics::new("test_op");
metrics
.execution_times
.extend([100, 105, 95, 110, 100, 98, 102, 108, 96, 104, 101, 99]);
metrics.execution_times.extend([150, 145, 155, 160, 148]); metrics.update_statistics();
profiler
.operation_metrics
.insert("test_op".to_string(), metrics);
let slow_result = ProfilingResult {
operation: "test_op".to_string(),
execution_time_us: 150, throughput: 1000.0,
memory_usage: 1000,
element_count: 100,
tensor_shape: vec![100],
};
profiler.check_performance_alerts("test_op", &slow_result);
assert!(!profiler.session.alerts.is_empty());
match &profiler.session.alerts[0] {
PerformanceAlert::Regression {
regression_percent, ..
} => {
assert!(*regression_percent > 20.0);
}
_ => {
assert!(
false,
"Expected regression alert, but got different alert type"
);
}
}
}
}