use crate::error::Result;
use std::collections::HashMap;
use std::time::{Duration, Instant};
/// Accumulated timing, memory, and compute statistics for one profiled layer.
///
/// `forward_ms` / `backward_ms` hold TOTAL time across all recorded runs;
/// the `avg_*` accessors divide by the corresponding run counter.
#[derive(Debug, Clone)]
pub struct LayerStats {
    pub layer_name: String,
    pub forward_ms: f64,
    pub backward_ms: f64,
    pub memory_bytes: usize,
    pub flops: u64,
    pub param_count: usize,
    pub num_forward_runs: usize,
    pub num_backward_runs: usize,
}

impl LayerStats {
    /// Creates a zeroed stats record for the given layer name.
    pub fn new(layer_name: impl Into<String>) -> Self {
        Self {
            layer_name: layer_name.into(),
            forward_ms: 0.0,
            backward_ms: 0.0,
            memory_bytes: 0,
            flops: 0,
            param_count: 0,
            num_forward_runs: 0,
            num_backward_runs: 0,
        }
    }

    /// Average forward time per run in milliseconds (0.0 when no runs recorded).
    pub fn avg_forward_ms(&self) -> f64 {
        match self.num_forward_runs {
            0 => 0.0,
            runs => self.forward_ms / runs as f64,
        }
    }

    /// Average backward time per run in milliseconds (0.0 when no runs recorded).
    pub fn avg_backward_ms(&self) -> f64 {
        match self.num_backward_runs {
            0 => 0.0,
            runs => self.backward_ms / runs as f64,
        }
    }

    /// Recorded memory converted from bytes to mebibytes.
    pub fn memory_mb(&self) -> f64 {
        const BYTES_PER_MIB: f64 = 1024.0 * 1024.0;
        self.memory_bytes as f64 / BYTES_PER_MIB
    }

    /// Recorded FLOPs expressed in gigaFLOPs.
    pub fn gflops(&self) -> f64 {
        self.flops as f64 * 1e-9
    }
}
/// Profiles per-layer forward/backward execution time and reported memory use.
pub struct PerformanceProfiler {
    /// Stats keyed by layer name; re-profiling a layer overwrites its entry.
    layer_stats: HashMap<String, LayerStats>,
    /// Untimed runs executed before measurement (cache/branch-predictor warmup).
    pub warmup_runs: usize,
    /// Number of timed runs per layer; values below 1 are treated as 1.
    pub timed_runs: usize,
}
impl PerformanceProfiler {
    /// Creates a profiler with 1 warmup run and 3 timed runs per layer.
    pub fn new() -> Self {
        Self {
            layer_stats: HashMap::new(),
            warmup_runs: 1,
            timed_runs: 3,
        }
    }

    /// Times the forward pass of a layer.
    ///
    /// `forward_fn` returns `(output, memory_bytes)`. It is invoked
    /// `warmup_runs` times untimed, then `timed_runs.max(1)` times timed.
    /// `flops` is an optional analytic FLOP count (stored as 0 when `None`).
    ///
    /// The stored/returned `forward_ms` is the TOTAL wall time across all
    /// timed runs, paired with `num_forward_runs`, so that
    /// `LayerStats::avg_forward_ms` yields the per-run average.
    pub fn profile_layer<T, F>(
        &mut self,
        layer_name: &str,
        forward_fn: F,
        flops: Option<u64>,
    ) -> Result<LayerStats>
    where
        F: Fn() -> (T, usize),
    {
        for _ in 0..self.warmup_runs {
            forward_fn();
        }
        let n_runs = self.timed_runs.max(1);
        let mut total_ns: u128 = 0;
        let mut memory_bytes = 0usize;
        for _ in 0..n_runs {
            let start = Instant::now();
            let (_, mem) = forward_fn();
            total_ns += start.elapsed().as_nanos();
            // Memory is assumed identical per run; keep the latest report.
            memory_bytes = mem;
        }
        // BUG FIX: previously the pre-averaged time was stored together with
        // num_forward_runs = n_runs, so LayerStats::avg_forward_ms divided by
        // the run count twice. Store the total instead (matching the
        // LayerStats contract exercised by test_layer_stats_avg).
        let forward_ms = Self::ns_to_ms(total_ns);
        let stats = LayerStats {
            layer_name: layer_name.to_string(),
            forward_ms,
            backward_ms: 0.0,
            memory_bytes,
            flops: flops.unwrap_or(0),
            param_count: 0,
            num_forward_runs: n_runs,
            num_backward_runs: 0,
        };
        self.layer_stats
            .insert(layer_name.to_string(), stats.clone());
        Ok(stats)
    }

    /// Times both the forward and backward pass of a layer; the forward
    /// output is handed to `backward_fn`, mirroring a training step.
    ///
    /// As in [`Self::profile_layer`], `forward_ms`/`backward_ms` hold totals
    /// across all timed runs so the `avg_*` accessors are correct.
    pub fn profile_layer_with_backward<T, F, B>(
        &mut self,
        layer_name: &str,
        forward_fn: F,
        backward_fn: B,
        flops: Option<u64>,
    ) -> Result<LayerStats>
    where
        F: Fn() -> (T, usize),
        B: Fn(T),
    {
        for _ in 0..self.warmup_runs {
            let (val, _) = forward_fn();
            backward_fn(val);
        }
        let n_runs = self.timed_runs.max(1);
        let mut fwd_ns: u128 = 0;
        let mut bwd_ns: u128 = 0;
        let mut memory_bytes = 0usize;
        for _ in 0..n_runs {
            let fwd_start = Instant::now();
            let (val, mem) = forward_fn();
            fwd_ns += fwd_start.elapsed().as_nanos();
            memory_bytes = mem;
            let bwd_start = Instant::now();
            backward_fn(val);
            bwd_ns += bwd_start.elapsed().as_nanos();
        }
        // Same double-division fix as profile_layer: record totals, not averages.
        let stats = LayerStats {
            layer_name: layer_name.to_string(),
            forward_ms: Self::ns_to_ms(fwd_ns),
            backward_ms: Self::ns_to_ms(bwd_ns),
            memory_bytes,
            flops: flops.unwrap_or(0),
            param_count: 0,
            num_forward_runs: n_runs,
            num_backward_runs: n_runs,
        };
        self.layer_stats
            .insert(layer_name.to_string(), stats.clone());
        Ok(stats)
    }

    /// Converts a nanosecond total to milliseconds, saturating instead of
    /// silently truncating if the u128 total exceeds u64::MAX.
    fn ns_to_ms(ns: u128) -> f64 {
        Duration::from_nanos(u64::try_from(ns).unwrap_or(u64::MAX)).as_secs_f64() * 1000.0
    }

    /// Returns the recorded stats for `layer_name`, if it was profiled.
    pub fn get_stats(&self, layer_name: &str) -> Option<&LayerStats> {
        self.layer_stats.get(layer_name)
    }

    /// All recorded stats, slowest average forward time first.
    pub fn all_stats_sorted(&self) -> Vec<&LayerStats> {
        let mut v: Vec<&LayerStats> = self.layer_stats.values().collect();
        v.sort_by(|a, b| {
            // Descending; NaN comparisons fall back to Equal.
            b.avg_forward_ms()
                .partial_cmp(&a.avg_forward_ms())
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        v
    }

    /// Discards all recorded stats.
    pub fn reset(&mut self) {
        self.layer_stats.clear();
    }

    /// Builds an aggregate report over every profiled layer.
    pub fn report(&self) -> ProfilingReport {
        let stats: Vec<LayerStats> = self.layer_stats.values().cloned().collect();
        ProfilingReport::from_stats(stats)
    }
}
impl Default for PerformanceProfiler {
fn default() -> Self {
Self::new()
}
}
/// Analytic FLOP counters for common neural-network layers.
///
/// Multiply–accumulate work is counted as 2 FLOPs per MAC throughout.
pub struct FLOPsCounter;

impl FLOPsCounter {
    /// Fully-connected layer: `2 * batch * in_features * out_features`.
    pub fn dense(batch: usize, in_features: usize, out_features: usize) -> u64 {
        2u64 * batch as u64 * in_features as u64 * out_features as u64
    }

    /// Standard 2-D convolution: 2 FLOPs per MAC, one MAC per
    /// (input channel × kernel element) at each output position.
    pub fn conv2d(
        batch: usize,
        in_channels: usize,
        out_channels: usize,
        kernel_h: usize,
        kernel_w: usize,
        out_height: usize,
        out_width: usize,
    ) -> u64 {
        let positions = batch as u64 * out_channels as u64 * out_height as u64 * out_width as u64;
        let macs_per_position = in_channels as u64 * kernel_h as u64 * kernel_w as u64;
        2 * positions * macs_per_position
    }

    /// Depthwise 2-D convolution: each channel is convolved independently.
    pub fn depthwise_conv2d(
        batch: usize,
        channels: usize,
        kernel_h: usize,
        kernel_w: usize,
        out_height: usize,
        out_width: usize,
    ) -> u64 {
        let positions = batch as u64 * channels as u64 * out_height as u64 * out_width as u64;
        let macs_per_position = kernel_h as u64 * kernel_w as u64;
        2 * positions * macs_per_position
    }

    /// Pointwise (1x1) convolution: a dense layer applied at every pixel.
    pub fn pointwise_conv2d(
        batch: usize,
        in_channels: usize,
        out_channels: usize,
        height: usize,
        width: usize,
    ) -> u64 {
        2u64 * batch as u64 * in_channels as u64 * out_channels as u64 * height as u64 * width as u64
    }

    /// Scaled dot-product attention over all heads: per head, QK^T costs
    /// 2·L²·d_k, softmax-related work is counted as 4·L², and the
    /// attention-by-values product costs 2·L²·d_v. Input/output projections
    /// are NOT included here.
    pub fn attention(
        batch: usize,
        num_heads: usize,
        seq_len: usize,
        d_key: usize,
        d_val: usize,
    ) -> u64 {
        let (l, dk, dv) = (seq_len as u64, d_key as u64, d_val as u64);
        let per_head = 2 * l * l * dk + 4 * l * l + 2 * l * l * dv;
        batch as u64 * num_heads as u64 * per_head
    }

    /// Batch normalization: one scale and one shift per element.
    pub fn batch_norm(batch: usize, channels: usize, height: usize, width: usize) -> u64 {
        2u64 * batch as u64 * channels as u64 * height as u64 * width as u64
    }

    /// Layer normalization: counted as two FLOPs per normalized element.
    pub fn layer_norm(batch: usize, seq_len: usize, hidden_dim: usize) -> u64 {
        2u64 * batch as u64 * seq_len as u64 * hidden_dim as u64
    }

    /// Embedding lookup is a gather, not arithmetic, so it costs 0 FLOPs.
    pub fn embedding(_batch: usize, _seq_len: usize, _embedding_dim: usize) -> u64 {
        0
    }

    /// LSTM over `seq_len` steps.
    ///
    /// NOTE(review): this per-step cost is linear in input/hidden size and
    /// contains no input_size*hidden_size / hidden_size^2 matmul products —
    /// confirm this simplified cost model is intentional.
    pub fn lstm_cell(batch: usize, input_size: usize, hidden_size: usize, seq_len: usize) -> u64 {
        let per_step = 8 * batch as u64 * hidden_size as u64
            + 8 * batch as u64 * input_size as u64
            + 12 * batch as u64 * hidden_size as u64;
        per_step * seq_len as u64
    }
}
/// Aggregate summary built from a set of [`LayerStats`].
#[derive(Debug, Clone)]
pub struct ProfilingReport {
    /// The per-layer stats the report was built from (order unspecified).
    pub layers: Vec<LayerStats>,
    /// Sum of per-layer average forward times, in milliseconds.
    pub total_forward_ms: f64,
    /// Sum of per-layer average backward times, in milliseconds.
    pub total_backward_ms: f64,
    /// Sum of per-layer reported memory, in bytes.
    pub total_memory_bytes: usize,
    /// Sum of per-layer FLOP counts.
    pub total_flops: u64,
    /// Name of the layer with the highest average forward time, if any.
    pub bottleneck_layer: Option<String>,
}
impl ProfilingReport {
    /// Aggregates per-layer stats into totals and identifies the bottleneck
    /// (the layer with the largest average forward time).
    pub fn from_stats(layers: Vec<LayerStats>) -> Self {
        let mut total_forward_ms = 0.0;
        let mut total_backward_ms = 0.0;
        let mut total_memory_bytes = 0usize;
        let mut total_flops = 0u64;
        for stats in &layers {
            total_forward_ms += stats.avg_forward_ms();
            total_backward_ms += stats.avg_backward_ms();
            total_memory_bytes += stats.memory_bytes;
            total_flops += stats.flops;
        }
        let bottleneck_layer = layers
            .iter()
            .max_by(|lhs, rhs| {
                // NaN comparisons fall back to Equal.
                lhs.avg_forward_ms()
                    .partial_cmp(&rhs.avg_forward_ms())
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .map(|slowest| slowest.layer_name.clone());
        Self {
            layers,
            total_forward_ms,
            total_backward_ms,
            total_memory_bytes,
            total_flops,
            bottleneck_layer,
        }
    }

    /// Total reported memory converted from bytes to mebibytes.
    pub fn total_memory_mb(&self) -> f64 {
        const BYTES_PER_MIB: f64 = 1024.0 * 1024.0;
        self.total_memory_bytes as f64 / BYTES_PER_MIB
    }

    /// Total FLOPs expressed in gigaFLOPs.
    pub fn total_gflops(&self) -> f64 {
        self.total_flops as f64 * 1e-9
    }

    /// Layers ordered by descending average forward time.
    pub fn layers_by_forward_time(&self) -> Vec<&LayerStats> {
        let mut ordered: Vec<&LayerStats> = self.layers.iter().collect();
        ordered.sort_by(|lhs, rhs| {
            rhs.avg_forward_ms()
                .partial_cmp(&lhs.avg_forward_ms())
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        ordered
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Forward-only profiling records name, non-negative time, memory, FLOPs.
    #[test]
    fn test_profile_layer_basic() {
        let mut profiler = PerformanceProfiler::new();
        let stats = profiler
            .profile_layer(
                "dense_1",
                || {
                    let _: f64 = (0..1000).map(|i| i as f64).sum();
                    ((), 1000 * 8)
                },
                Some(2_000),
            )
            .expect("ok");
        assert_eq!(stats.layer_name, "dense_1");
        assert!(stats.forward_ms >= 0.0);
        assert_eq!(stats.memory_bytes, 1000 * 8);
        assert_eq!(stats.flops, 2_000);
    }

    // Backward timing is recorded and the backward run count matches timed_runs.
    #[test]
    fn test_profile_layer_with_backward() {
        let mut profiler = PerformanceProfiler::new();
        let stats = profiler
            .profile_layer_with_backward(
                "layer_a",
                || (vec![1.0_f64; 64], 64 * 8),
                |v: Vec<f64>| {
                    let _: f64 = v.iter().sum();
                },
                None,
            )
            .expect("ok");
        assert!(stats.backward_ms >= 0.0);
        assert_eq!(stats.num_backward_runs, profiler.timed_runs.max(1));
    }

    // Lookup by name: present for a profiled layer, None otherwise.
    #[test]
    fn test_profiler_get_stats() {
        let mut profiler = PerformanceProfiler::new();
        profiler
            .profile_layer("conv1", || ((), 512 * 64), None)
            .expect("ok");
        assert!(profiler.get_stats("conv1").is_some());
        assert!(profiler.get_stats("nonexistent").is_none());
    }

    // reset() discards all recorded stats.
    #[test]
    fn test_profiler_reset() {
        let mut profiler = PerformanceProfiler::new();
        profiler.profile_layer("l1", || ((), 0), None).expect("ok");
        assert!(!profiler.layer_stats.is_empty());
        profiler.reset();
        assert!(profiler.layer_stats.is_empty());
    }

    // The report sums memory and FLOPs over all profiled layers.
    #[test]
    fn test_profiler_report() {
        let mut profiler = PerformanceProfiler::new();
        profiler
            .profile_layer("l1", || ((), 100), Some(1000))
            .expect("ok");
        profiler
            .profile_layer("l2", || ((), 200), Some(2000))
            .expect("ok");
        let report = profiler.report();
        assert_eq!(report.layers.len(), 2);
        assert_eq!(report.total_flops, 3000);
        assert_eq!(report.total_memory_bytes, 300);
    }

    // all_stats_sorted returns every layer (ordering itself is timing-dependent,
    // so only the count is asserted).
    #[test]
    fn test_all_stats_sorted() {
        let mut profiler = PerformanceProfiler::new();
        profiler.warmup_runs = 0;
        profiler.timed_runs = 1;
        profiler
            .profile_layer("fast", || ((), 0), None)
            .expect("ok");
        profiler
            .profile_layer(
                "slow",
                || {
                    let _: u64 = (0u64..100_000).sum();
                    ((), 0)
                },
                None,
            )
            .expect("ok");
        let sorted = profiler.all_stats_sorted();
        assert_eq!(sorted.len(), 2);
    }

    // 2 FLOPs per MAC for a dense layer.
    #[test]
    fn test_flops_counter_dense() {
        let flops = FLOPsCounter::dense(1, 784, 512);
        assert_eq!(flops, 2 * 784 * 512);
    }

    // conv2d: 2 * out_positions * (in_channels * kernel area).
    #[test]
    fn test_flops_counter_conv2d() {
        let flops = FLOPsCounter::conv2d(1, 3, 64, 3, 3, 224, 224);
        let expected = 2u64 * 64 * 224 * 224 * 3 * 3 * 3;
        assert_eq!(flops, expected);
    }

    // depthwise: one filter per channel, no cross-channel MACs.
    #[test]
    fn test_flops_counter_depthwise() {
        let flops = FLOPsCounter::depthwise_conv2d(1, 32, 3, 3, 112, 112);
        let expected = 2u64 * 32 * 112 * 112 * 3 * 3;
        assert_eq!(flops, expected);
    }

    // Attention cost is positive for non-degenerate shapes.
    #[test]
    fn test_flops_counter_attention() {
        let flops = FLOPsCounter::attention(1, 8, 128, 64, 64);
        assert!(flops > 0);
    }

    // Batch norm: 2 FLOPs per element.
    #[test]
    fn test_flops_counter_batch_norm() {
        let flops = FLOPsCounter::batch_norm(4, 64, 56, 56);
        assert_eq!(flops, 2 * 4 * 64 * 56 * 56);
    }

    // Embedding is a lookup, not arithmetic.
    #[test]
    fn test_flops_counter_embedding_is_zero() {
        assert_eq!(FLOPsCounter::embedding(2, 512, 768), 0);
    }

    // Contract: forward_ms is a TOTAL; avg divides by num_forward_runs.
    #[test]
    fn test_layer_stats_avg() {
        let mut s = LayerStats::new("test");
        s.forward_ms = 6.0;
        s.num_forward_runs = 3;
        assert!((s.avg_forward_ms() - 2.0).abs() < 1e-6);
    }

    // 1 MiB of bytes converts to exactly 1.0 MB (mebibyte convention).
    #[test]
    fn test_layer_stats_memory_mb() {
        let mut s = LayerStats::new("x");
        s.memory_bytes = 1024 * 1024;
        assert!((s.memory_mb() - 1.0).abs() < 1e-6);
    }

    // The bottleneck is the layer with the largest average forward time.
    #[test]
    fn test_profiling_report_bottleneck() {
        let mut stats = vec![
            LayerStats {
                layer_name: "fast".to_string(),
                forward_ms: 1.0,
                num_forward_runs: 1,
                ..LayerStats::new("fast")
            },
            LayerStats {
                layer_name: "slow".to_string(),
                forward_ms: 10.0,
                num_forward_runs: 1,
                ..LayerStats::new("slow")
            },
        ];
        let report = ProfilingReport::from_stats(std::mem::take(&mut stats));
        assert_eq!(report.bottleneck_layer.as_deref(), Some("slow"));
    }
}