use crate::{CooTensor, CsrTensor, SparseFormat, SparseTensor, TorshResult};
use super::core::{BenchmarkConfig, PerformanceMeasurement};
use std::collections::HashMap;
use std::time::{Duration, Instant};
use torsh_core::{Shape, TorshError};
use torsh_tensor::Tensor;
/// Profiler that benchmarks sparse tensor operations and keeps a history of the results.
///
/// Each benchmark method returns a `PerformanceMeasurement` and also stores a copy of it,
/// so results can be inspected later through `measurements()`.
pub struct SparseProfiler {
/// Benchmark settings; the methods in this file read its warmup/measured iteration
/// counts, the memory-collection flag and the per-iteration time limit.
config: BenchmarkConfig,
/// Measurements recorded so far, in the order the benchmarks completed.
measurements: Vec<PerformanceMeasurement>,
/// Counters keyed by operation label (for example `convert_Coo_to_Csr`), bumped by
/// benchmark methods when they record a run.
operation_counts: HashMap<String, usize>,
}
impl Default for SparseProfiler {
fn default() -> Self {
Self::new(BenchmarkConfig::default())
}
}
impl SparseProfiler {
/// Creates a profiler that uses `config` and starts with no recorded measurements
/// and no operation counters.
pub fn new(config: BenchmarkConfig) -> Self {
    let measurements = Vec::new();
    let operation_counts = HashMap::new();
    Self {
        config,
        measurements,
        operation_counts,
    }
}
/// Benchmarks converting `sparse` into `target_format`.
///
/// Runs `config.warmup_iterations` untimed conversions followed by
/// `config.measured_iterations` timed ones. The returned measurement holds the mean
/// duration of the timed runs; its metrics map contains `min_time_ns`, `max_time_ns`,
/// `std_dev_ns`, `nnz` and `sparsity`. When `config.collect_memory` is set, the memory
/// fields are derived from the size estimates of the input and of the converted tensor;
/// otherwise they stay at 0. A copy of the measurement is appended to the profiler
/// history and the counter for this operation label is incremented.
///
/// # Errors
///
/// * `TorshError::ComputeError` if `config.measured_iterations` is zero (there would be
///   no samples to average), or if a timed iteration takes longer than
///   `config.max_iteration_time`.
/// * Any error produced by the conversion itself.
pub fn benchmark_format_conversion(
    &mut self,
    sparse: &dyn SparseTensor,
    target_format: SparseFormat,
) -> TorshResult<PerformanceMeasurement> {
    // Without this guard the averaging below divides by zero and the min/max
    // lookups panic on an empty sample set.
    if self.config.measured_iterations == 0 {
        return Err(TorshError::ComputeError(
            "Cannot benchmark with zero measured iterations".to_string(),
        ));
    }

    let operation = format!("convert_{:?}_to_{:?}", sparse.format(), target_format);

    // Warmup runs: results and timings are discarded, errors still propagate.
    for _ in 0..self.config.warmup_iterations {
        let _ = self.convert_format(sparse, target_format)?;
    }

    let mut durations = Vec::new();
    let mut memory_before = 0;
    let mut memory_after = 0;
    let mut peak_memory = 0;

    for _ in 0..self.config.measured_iterations {
        if self.config.collect_memory {
            memory_before = self.estimate_memory_usage(sparse);
        }

        let start = Instant::now();
        let result = self.convert_format(sparse, target_format)?;
        let duration = start.elapsed();

        if self.config.collect_memory {
            // Input and output are both alive at this point, so the estimate is their sum.
            memory_after = memory_before + self.estimate_memory_usage(&*result);
            peak_memory = std::cmp::max(peak_memory, memory_after);
        }

        durations.push(duration);

        if duration > self.config.max_iteration_time {
            return Err(TorshError::ComputeError(format!(
                "Operation exceeded maximum time limit: {duration:?}"
            )));
        }
    }

    let total_nanos: u128 = durations.iter().map(Duration::as_nanos).sum();
    let mean_nanos = total_nanos / durations.len() as u128;
    // Saturate instead of silently truncating if the mean does not fit in 64 bits.
    let avg_duration = Duration::from_nanos(u64::try_from(mean_nanos).unwrap_or(u64::MAX));

    let fastest = durations
        .iter()
        .min()
        .expect("durations should not be empty");
    let slowest = durations
        .iter()
        .max()
        .expect("durations should not be empty");

    let mut metrics = HashMap::new();
    metrics.insert("min_time_ns".to_string(), fastest.as_nanos() as f64);
    metrics.insert("max_time_ns".to_string(), slowest.as_nanos() as f64);
    metrics.insert("std_dev_ns".to_string(), self.compute_std_dev(&durations));
    metrics.insert("nnz".to_string(), sparse.nnz() as f64);
    metrics.insert("sparsity".to_string(), sparse.sparsity() as f64);

    let measurement = PerformanceMeasurement {
        operation: operation.clone(),
        duration: avg_duration,
        memory_before,
        memory_after,
        peak_memory,
        metrics,
    };

    self.measurements.push(measurement.clone());
    *self.operation_counts.entry(operation).or_insert(0) += 1;
    Ok(measurement)
}
pub fn benchmark_sparse_matmul(
&mut self,
a: &dyn SparseTensor,
b: &dyn SparseTensor,
) -> TorshResult<PerformanceMeasurement> {
let operation = format!("matmul_{:?}_{:?}", a.format(), b.format());
for _ in 0..self.config.warmup_iterations {
let _ = self.perform_matmul(a, b)?;
}
let mut durations = Vec::new();
let memory_before = if self.config.collect_memory {
self.estimate_memory_usage(a) + self.estimate_memory_usage(b)
} else {
0
};
for _ in 0..self.config.measured_iterations {
let start = Instant::now();
let _result = self.perform_matmul(a, b)?;
let duration = start.elapsed();
durations.push(duration);
}
let avg_duration = Duration::from_nanos(
(durations.iter().map(|d| d.as_nanos()).sum::<u128>() / durations.len() as u128) as u64,
);
let mut metrics = HashMap::new();
metrics.insert("a_nnz".to_string(), a.nnz() as f64);
metrics.insert("b_nnz".to_string(), b.nnz() as f64);
metrics.insert("flops_estimate".to_string(), (2 * a.nnz() * b.nnz()) as f64);
metrics.insert(
"min_time_ns".to_string(),
durations.iter().min().expect("durations should not be empty").as_nanos() as f64,
);
metrics.insert(
"max_time_ns".to_string(),
durations.iter().max().expect("durations should not be empty").as_nanos() as f64,
);
let measurement = PerformanceMeasurement {
operation: operation.clone(),
duration: avg_duration,
memory_before,
memory_after: memory_before, peak_memory: memory_before,
metrics,
};
self.measurements.push(measurement.clone());
*self.operation_counts.entry(operation).or_insert(0) += 1;
Ok(measurement)
}
pub fn benchmark_dense_to_sparse(
&mut self,
dense: &Tensor,
format: SparseFormat,
threshold: f32,
) -> TorshResult<PerformanceMeasurement> {
let operation = format!("dense_to_{format:?}");
for _ in 0..self.config.warmup_iterations {
let _ = self.convert_dense_to_sparse(dense, format, threshold)?;
}
let mut durations = Vec::new();
let memory_before = if self.config.collect_memory {
dense.shape().numel() * std::mem::size_of::<f32>()
} else {
0
};
for _ in 0..self.config.measured_iterations {
let start = Instant::now();
let _result = self.convert_dense_to_sparse(dense, format, threshold)?;
let duration = start.elapsed();
durations.push(duration);
}
let avg_duration = Duration::from_nanos(
(durations.iter().map(|d| d.as_nanos()).sum::<u128>() / durations.len() as u128) as u64,
);
let mut metrics = HashMap::new();
metrics.insert("dense_elements".to_string(), dense.numel() as f64);
metrics.insert(
"compression_ratio".to_string(),
dense.numel() as f64 / (self.count_nonzeros(dense, threshold)? as f64),
);
let measurement = PerformanceMeasurement {
operation: operation.clone(),
duration: avg_duration,
memory_before,
memory_after: memory_before, peak_memory: memory_before,
metrics,
};
self.measurements.push(measurement.clone());
Ok(measurement)
}
/// Benchmarks the dense-to-sparse conversion of `dense` for the COO, CSR and CSC
/// formats (in that order) and returns the measurements keyed by format.
///
/// # Errors
///
/// Stops at, and returns, the first error reported by `benchmark_dense_to_sparse`.
pub fn profile_format_comparison(
    &mut self,
    dense: &Tensor,
    threshold: f32,
) -> TorshResult<HashMap<SparseFormat, PerformanceMeasurement>> {
    let mut by_format = HashMap::new();
    for candidate in [SparseFormat::Coo, SparseFormat::Csr, SparseFormat::Csc] {
        let outcome = self.benchmark_dense_to_sparse(dense, candidate, threshold)?;
        by_format.insert(candidate, outcome);
    }
    Ok(by_format)
}
/// Returns every measurement recorded so far, oldest first.
pub fn measurements(&self) -> &[PerformanceMeasurement] {
    self.measurements.as_slice()
}
pub fn operation_counts(&self) -> &HashMap<String, usize> {
&self.operation_counts
}
pub fn clear_measurements(&mut self) {
self.measurements.clear();
self.operation_counts.clear();
}
pub fn config(&self) -> &BenchmarkConfig {
&self.config
}
pub fn set_config(&mut self, config: BenchmarkConfig) {
self.config = config;
}
/// Converts `sparse` into `target_format` and returns the result as a boxed trait
/// object.
///
/// # Errors
///
/// Returns `TorshError::UnsupportedOperation` for any format other than COO, CSR or
/// CSC, and forwards errors from the underlying conversion.
fn convert_format(
    &self,
    sparse: &dyn SparseTensor,
    target_format: SparseFormat,
) -> TorshResult<Box<dyn SparseTensor>> {
    let converted: Box<dyn SparseTensor> = match target_format {
        SparseFormat::Coo => Box::new(sparse.to_coo()?),
        SparseFormat::Csr => Box::new(sparse.to_csr()?),
        SparseFormat::Csc => Box::new(sparse.to_csc()?),
        _ => {
            return Err(TorshError::UnsupportedOperation {
                op: format!("Conversion to {target_format:?}"),
                dtype: "sparse_tensor".to_string(),
            });
        }
    };
    Ok(converted)
}
/// Multiplies `a` by `b`: both operands are first converted to CSR, then handed to
/// `csr_multiply`, whose COO result is returned boxed.
fn perform_matmul(
    &self,
    a: &dyn SparseTensor,
    b: &dyn SparseTensor,
) -> TorshResult<Box<dyn SparseTensor>> {
    let lhs = a.to_csr()?;
    let rhs = b.to_csr()?;
    self.csr_multiply(&lhs, &rhs)
        .map(|product| Box::new(product) as Box<dyn SparseTensor>)
}
fn csr_multiply(&self, a: &CsrTensor, b: &CsrTensor) -> TorshResult<CooTensor> {
let a_coo = a.to_coo()?;
let b_coo = b.to_coo()?;
let mut result_triplets = Vec::new();
let a_triplets = a_coo.triplets();
let b_triplets = b_coo.triplets();
let mut b_col_map: HashMap<usize, Vec<(usize, f32)>> = HashMap::new();
for (row, col, val) in b_triplets {
b_col_map.entry(row).or_default().push((col, val));
}
for (a_row, a_col, a_val) in a_triplets {
if let Some(b_entries) = b_col_map.get(&a_col) {
for &(b_col, b_val) in b_entries {
result_triplets.push((a_row, b_col, a_val * b_val));
}
}
}
result_triplets.sort_by_key(|&(r, c, _)| (r, c));
let mut final_triplets = Vec::new();
let mut current_sum = 0.0;
let mut current_pos = (usize::MAX, usize::MAX);
for (r, c, v) in result_triplets {
if (r, c) == current_pos {
current_sum += v;
} else {
if current_pos != (usize::MAX, usize::MAX) && current_sum.abs() > 1e-12 {
final_triplets.push((current_pos.0, current_pos.1, current_sum));
}
current_pos = (r, c);
current_sum = v;
}
}
if current_pos != (usize::MAX, usize::MAX) && current_sum.abs() > 1e-12 {
final_triplets.push((current_pos.0, current_pos.1, current_sum));
}
let (rows, cols, vals): (Vec<_>, Vec<_>, Vec<_>) = final_triplets.into_iter().fold(
(Vec::new(), Vec::new(), Vec::new()),
|(mut rs, mut cs, mut vs), (r, c, v)| {
rs.push(r);
cs.push(c);
vs.push(v);
(rs, cs, vs)
},
);
let result_shape = Shape::new(vec![a.shape().dims()[0], b.shape().dims()[1]]);
CooTensor::new(rows, cols, vals, result_shape)
}
/// Turns `dense` into a sparse tensor of the requested `format`: a COO tensor is built
/// with `CooTensor::from_dense(dense, threshold)` and then passed to `convert_format`.
fn convert_dense_to_sparse(
    &self,
    dense: &Tensor,
    format: SparseFormat,
    threshold: f32,
) -> TorshResult<Box<dyn SparseTensor>> {
    CooTensor::from_dense(dense, threshold)
        .and_then(|as_coo| self.convert_format(&as_coo, format))
}
/// Returns a rough size estimate for `sparse`, derived only from its format, its
/// number of stored entries and its shape.
///
/// * CSR: `nnz * 8 + rows * 4`
/// * CSC: `nnz * 8 + cols * 4`
/// * COO and any other format: `nnz * 12`
///
/// NOTE(review): the constants look like byte counts for 4-byte indices and values;
/// confirm against the actual tensor storage.
fn estimate_memory_usage(&self, sparse: &dyn SparseTensor) -> usize {
    let nnz = sparse.nnz();
    // Compressed formats add one pointer array along a single axis.
    let pointer_axis = match sparse.format() {
        SparseFormat::Csr => Some(0),
        SparseFormat::Csc => Some(1),
        _ => None,
    };
    match pointer_axis {
        Some(axis) => nnz * 8 + sparse.shape().dims()[axis] * 4,
        None => nnz * 12,
    }
}
/// Counts the elements of `dense` whose absolute value is strictly greater than
/// `threshold`.
///
/// Only the first two dimensions of the shape are walked, so the tensor is treated as
/// a matrix.
///
/// # Errors
///
/// Forwards any error returned by `dense.get`.
fn count_nonzeros(&self, dense: &Tensor, threshold: f32) -> TorshResult<usize> {
    let shape = dense.shape();
    let mut significant = 0;
    for row in 0..shape.dims()[0] {
        let n_cols = shape.dims()[1];
        for col in 0..n_cols {
            if dense.get(&[row, col])?.abs() > threshold {
                significant += 1;
            }
        }
    }
    Ok(significant)
}
/// Returns the population standard deviation of `durations`, in nanoseconds.
///
/// The squared deviations are divided by the sample count itself (not by count - 1).
/// An empty slice yields NaN.
fn compute_std_dev(&self, durations: &[Duration]) -> f64 {
    let sample_count = durations.len() as f64;
    let total_nanos: u128 = durations.iter().map(|sample| sample.as_nanos()).sum();
    let mean = total_nanos as f64 / sample_count;

    let squared_deviation = |sample: &Duration| {
        let delta = sample.as_nanos() as f64 - mean;
        delta * delta
    };
    let sum_of_squares = durations.iter().map(squared_deviation).sum::<f64>();

    (sum_of_squares / sample_count).sqrt()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::CooTensor;
    use torsh_core::Shape;

    /// Builds a 3x3 COO tensor holding four stored values.
    fn create_test_sparse_tensor() -> TorshResult<CooTensor> {
        CooTensor::new(
            vec![0, 0, 1, 2],
            vec![0, 2, 1, 2],
            vec![1.0, 2.0, 3.0, 4.0],
            Shape::new(vec![3, 3]),
        )
    }

    /// A fresh profiler has no history and exposes the configuration it was given.
    #[test]
    fn test_sparse_profiler_creation() {
        let profiler = SparseProfiler::new(BenchmarkConfig::fast());

        assert!(profiler.measurements().is_empty());
        assert!(profiler.operation_counts().is_empty());
        assert_eq!(profiler.config().warmup_iterations, 1);
    }

    /// A COO -> CSR benchmark reports the expected label, a non-zero time and metrics.
    #[test]
    fn test_format_conversion_benchmark() -> TorshResult<()> {
        let coo = create_test_sparse_tensor()?;
        let mut profiler = SparseProfiler::new(BenchmarkConfig::fast());

        let measurement = profiler.benchmark_format_conversion(&coo, SparseFormat::Csr)?;

        assert_eq!(measurement.operation, "convert_Coo_to_Csr");
        assert!(!measurement.duration.is_zero());
        for key in ["nnz", "sparsity"] {
            assert!(measurement.metrics.contains_key(key));
        }
        Ok(())
    }

    /// Each benchmark call adds one measurement and bumps its own counter.
    #[test]
    fn test_profiler_measurements_collection() -> TorshResult<()> {
        let coo = create_test_sparse_tensor()?;
        let mut profiler = SparseProfiler::new(BenchmarkConfig::fast());

        for target in [SparseFormat::Csr, SparseFormat::Csc] {
            profiler.benchmark_format_conversion(&coo, target)?;
        }

        let counts = profiler.operation_counts();
        assert_eq!(profiler.measurements().len(), 2);
        assert_eq!(counts.len(), 2);
        assert_eq!(counts.get("convert_Coo_to_Csr"), Some(&1));
        assert_eq!(counts.get("convert_Coo_to_Csc"), Some(&1));
        Ok(())
    }

    /// Clearing wipes both the measurement history and the counters.
    #[test]
    fn test_clear_measurements() -> TorshResult<()> {
        let coo = create_test_sparse_tensor()?;
        let mut profiler = SparseProfiler::new(BenchmarkConfig::fast());

        profiler.benchmark_format_conversion(&coo, SparseFormat::Csr)?;
        assert_eq!(profiler.measurements().len(), 1);

        profiler.clear_measurements();
        assert!(profiler.measurements().is_empty());
        assert!(profiler.operation_counts().is_empty());
        Ok(())
    }

    /// The COO estimate is 12 per stored value; the fixture stores four values.
    #[test]
    fn test_memory_estimation() {
        let coo = create_test_sparse_tensor().expect("create test sparse tensor should succeed");
        let profiler = SparseProfiler::new(BenchmarkConfig::default());

        let memory_usage = profiler.estimate_memory_usage(&coo);

        assert_eq!(memory_usage, 4 * 12);
    }

    /// Samples spread around 100 ns give a positive deviation below 10 ns.
    #[test]
    fn test_std_dev_computation() {
        let profiler = SparseProfiler::new(BenchmarkConfig::default());
        let durations = [100, 110, 90, 105].map(Duration::from_nanos);

        let std_dev = profiler.compute_std_dev(&durations);

        assert!(std_dev > 0.0 && std_dev < 10.0);
    }
}