use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Numeric precision that a peak-compute figure refers to.
///
/// Used as the key type of `RooflineModel::peak_compute`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Precision {
/// Displayed as `FP32`.
Fp32,
/// Displayed as `FP16 Tensor`.
Fp16,
/// Displayed as `TF32 Tensor`.
Tf32,
/// Displayed as `INT8 Tensor`.
Int8,
/// Displayed as `BF16`. None of the preset constructors visible in this file
/// registers a peak for this precision.
Bf16,
}
impl std::fmt::Display for Precision {
    /// Writes the human-readable label for this precision.
    ///
    /// The label is emitted through [`std::fmt::Formatter::pad`], so width, fill,
    /// alignment and precision flags (for example `{:15}` or `{:>8}`) are honored.
    /// With a plain `{}` the output is exactly the bare label.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Precision::Fp32 => "FP32",
            Precision::Fp16 => "FP16 Tensor",
            Precision::Tf32 => "TF32 Tensor",
            Precision::Int8 => "INT8 Tensor",
            Precision::Bf16 => "BF16",
        };
        // `write!(f, "...")` writes the literal straight through and ignores the
        // caller's format flags, so a column spec such as `{prec:15}` would not
        // pad. `pad` applies those flags.
        f.pad(label)
    }
}
/// Level of the memory hierarchy that a peak-bandwidth figure refers to.
///
/// Used as the key type of `RooflineModel::peak_bandwidth`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum MemoryLevel {
/// Displayed as `L1 Cache`.
L1Cache,
/// Displayed as `L2 Cache`.
L2Cache,
/// Displayed as `DRAM`. This is the level the report and the empirical
/// comparison in this file use as their reference.
Dram,
/// Displayed as `PCIe`.
Pcie,
}
impl std::fmt::Display for MemoryLevel {
    /// Writes the human-readable label for this memory level.
    ///
    /// The label is emitted through [`std::fmt::Formatter::pad`], so width, fill,
    /// alignment and precision flags (for example `{:15}`) are honored. With a
    /// plain `{}` the output is exactly the bare label.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            MemoryLevel::L1Cache => "L1 Cache",
            MemoryLevel::L2Cache => "L2 Cache",
            MemoryLevel::Dram => "DRAM",
            MemoryLevel::Pcie => "PCIe",
        };
        // `write!(f, "...")` ignores the caller's format flags; `pad` applies
        // them, which is what makes `{level:15}` produce an aligned column.
        f.pad(label)
    }
}
/// Which roofline ceiling limits a kernel, as decided by `RooflineModel::classify`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Bound {
/// The kernel's arithmetic intensity is below the ridge point.
/// `bandwidth_utilization` is the achieved throughput as a percentage of the
/// roofline ceiling at that intensity.
Memory { bandwidth_utilization: f64 },
/// The kernel's arithmetic intensity is at or above the ridge point.
/// `compute_utilization` is the achieved throughput as a percentage of the
/// peak compute for the chosen precision.
Compute { compute_utilization: f64 },
}
/// Theoretical roofline description of one hardware target.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RooflineModel {
/// Human-readable name of the target; printed in the report header.
pub target: String,
/// Peak compute throughput per precision. The presets in this file store
/// operations per second (e.g. `82.6e12`), which the report prints as TFLOP/s.
pub peak_compute: HashMap<Precision, f64>,
/// Peak bandwidth per memory level. The presets in this file store bytes per
/// second (e.g. `1008.0e9`), which the report prints as GB/s or TB/s.
pub peak_bandwidth: HashMap<MemoryLevel, f64>,
}
impl RooflineModel {
    /// Returns the ridge point: peak compute divided by peak bandwidth.
    ///
    /// With the units used by the presets (operations/s and bytes/s) the result
    /// is in FLOP/byte. Returns `None` when either peak is not registered for the
    /// requested key, or when the registered bandwidth is not positive.
    pub fn ridge_point(&self, precision: Precision, mem_level: MemoryLevel) -> Option<f64> {
        let compute = *self.peak_compute.get(&precision)?;
        let bandwidth = *self.peak_bandwidth.get(&mem_level)?;
        if bandwidth <= 0.0 {
            return None;
        }
        Some(compute / bandwidth)
    }

    /// Returns the roofline ceiling at `arithmetic_intensity`:
    /// `min(peak_compute, arithmetic_intensity * peak_bandwidth)`.
    ///
    /// Returns `None` when either peak is not registered for the requested key.
    pub fn theoretical_peak(
        &self,
        arithmetic_intensity: f64,
        precision: Precision,
        mem_level: MemoryLevel,
    ) -> Option<f64> {
        let compute = *self.peak_compute.get(&precision)?;
        let bandwidth = *self.peak_bandwidth.get(&mem_level)?;
        Some(compute.min(arithmetic_intensity * bandwidth))
    }

    /// Places a measured kernel on the roofline.
    ///
    /// The kernel is memory-bound when `arithmetic_intensity` is below the ridge
    /// point and compute-bound otherwise. Returns `None` under the same
    /// conditions as [`RooflineModel::ridge_point`].
    ///
    /// All percentages are computed as `achieved / ceiling * 100`. A ceiling that
    /// is not positive (for example at zero arithmetic intensity) yields `0.0`
    /// rather than an infinite or NaN value, so the utilization stored in the
    /// [`Bound`] follows the same rule as `efficiency`.
    pub fn classify(
        &self,
        arithmetic_intensity: f64,
        achieved_throughput: f64,
        precision: Precision,
        mem_level: MemoryLevel,
    ) -> Option<KernelRooflinePoint> {
        let ridge = self.ridge_point(precision, mem_level)?;
        let peak = self.theoretical_peak(arithmetic_intensity, precision, mem_level)?;
        let peak_compute = *self.peak_compute.get(&precision)?;

        // Single guarded definition of "percent of ceiling", shared by every
        // ratio below so none of them can divide by zero.
        let percent_of = |ceiling: f64| {
            if ceiling > 0.0 {
                achieved_throughput / ceiling * 100.0
            } else {
                0.0
            }
        };

        let efficiency = percent_of(peak);
        let bound = if arithmetic_intensity < ridge {
            // Below the ridge the ceiling is the bandwidth slope, so bandwidth
            // utilization and overall efficiency are the same number.
            Bound::Memory {
                bandwidth_utilization: efficiency,
            }
        } else {
            Bound::Compute {
                compute_utilization: percent_of(peak_compute),
            }
        };

        // Factor by which the intensity would have to grow to reach the ridge
        // (> 1 left of the ridge, < 1 right of it).
        let distance_to_ridge = if arithmetic_intensity > 0.0 {
            ridge / arithmetic_intensity
        } else {
            f64::INFINITY
        };

        Some(KernelRooflinePoint {
            arithmetic_intensity,
            achieved_throughput,
            peak_throughput: peak,
            efficiency,
            bound,
            distance_to_ridge,
        })
    }

    /// Preset model for the NVIDIA GeForce RTX 4090.
    ///
    /// Registers FP32, FP16, TF32 and INT8 peaks and L1, L2, DRAM and PCIe
    /// bandwidths. No BF16 peak is registered.
    pub fn rtx_4090() -> Self {
        let peak_compute = HashMap::from([
            (Precision::Fp32, 82.6e12),
            (Precision::Fp16, 330.0e12),
            (Precision::Tf32, 165.0e12),
            (Precision::Int8, 660.0e12),
        ]);
        let peak_bandwidth = HashMap::from([
            (MemoryLevel::L1Cache, 19.0e12),
            (MemoryLevel::L2Cache, 5.3e12),
            (MemoryLevel::Dram, 1008.0e9),
            (MemoryLevel::Pcie, 32.0e9),
        ]);
        RooflineModel {
            target: "NVIDIA GeForce RTX 4090 (SM 8.9)".to_string(),
            peak_compute,
            peak_bandwidth,
        }
    }

    /// Shared builder for the CPU presets.
    ///
    /// FP32 peak is `2 * fp32_lanes * 2 * freq_ghz * 1e9 * cores`.
    /// NOTE(review): the two factors of 2 presumably stand for two FMA pipes per
    /// core and two FLOPs per FMA — confirm against the intended hardware.
    /// Only an FP32 peak and a DRAM bandwidth (`mem_bandwidth_gbps * 1e9`) are
    /// registered.
    fn cpu_simd_model(
        isa_label: &str,
        fp32_lanes: f64,
        freq_ghz: f64,
        cores: usize,
        mem_bandwidth_gbps: f64,
    ) -> Self {
        let fp32_peak = 2.0 * fp32_lanes * 2.0 * freq_ghz * 1e9 * cores as f64;
        RooflineModel {
            target: format!("CPU {isa_label} ({cores} cores @ {freq_ghz} GHz)"),
            peak_compute: HashMap::from([(Precision::Fp32, fp32_peak)]),
            peak_bandwidth: HashMap::from([(MemoryLevel::Dram, mem_bandwidth_gbps * 1e9)]),
        }
    }

    /// CPU preset using 8 FP32 lanes per vector (AVX2 + FMA).
    ///
    /// `freq_ghz` is the clock in GHz, `cores` the core count and
    /// `mem_bandwidth_gbps` the DRAM bandwidth in units of 1e9 bytes/s.
    pub fn cpu_avx2(freq_ghz: f64, cores: usize, mem_bandwidth_gbps: f64) -> Self {
        Self::cpu_simd_model("AVX2+FMA", 8.0, freq_ghz, cores, mem_bandwidth_gbps)
    }

    /// CPU preset using 16 FP32 lanes per vector (AVX-512 + FMA).
    ///
    /// Parameters as for [`RooflineModel::cpu_avx2`].
    pub fn cpu_avx512(freq_ghz: f64, cores: usize, mem_bandwidth_gbps: f64) -> Self {
        Self::cpu_simd_model("AVX-512+FMA", 16.0, freq_ghz, cores, mem_bandwidth_gbps)
    }

    /// CPU preset using 4 FP32 lanes per vector (NEON).
    ///
    /// Parameters as for [`RooflineModel::cpu_avx2`].
    pub fn cpu_neon(freq_ghz: f64, cores: usize, mem_bandwidth_gbps: f64) -> Self {
        Self::cpu_simd_model("NEON", 4.0, freq_ghz, cores, mem_bandwidth_gbps)
    }
}
/// One kernel placed on the roofline, as produced by `RooflineModel::classify`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KernelRooflinePoint {
/// Arithmetic intensity passed to `classify`, stored unchanged.
pub arithmetic_intensity: f64,
/// Achieved throughput passed to `classify`, stored unchanged.
pub achieved_throughput: f64,
/// Roofline ceiling at this arithmetic intensity (`theoretical_peak`).
pub peak_throughput: f64,
/// `achieved_throughput / peak_throughput * 100`, or `0.0` when the ceiling
/// is not positive.
pub efficiency: f64,
/// Whether the kernel sits left (memory) or right (compute) of the ridge point.
pub bound: Bound,
/// Ridge point divided by the arithmetic intensity; infinite when the
/// intensity is not positive.
pub distance_to_ridge: f64,
}
/// Host measurements produced by `measure_empirical`, with their ratio to the
/// theoretical model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmpiricalResult {
/// Larger of the copy and triad bandwidth measurements, in bytes per second.
pub measured_bandwidth_bps: f64,
/// Result of the single-core FLOP/s micro-benchmark.
pub measured_peak_flops: f64,
/// `measured_peak_flops / measured_bandwidth_bps`.
pub measured_ridge_point: f64,
/// Measured bandwidth as a percentage of the model's DRAM bandwidth.
pub bandwidth_efficiency: f64,
/// Measured FLOP/s as a percentage of the model's FP32 peak divided by the
/// number of physical cores reported by `num_cpus`.
pub compute_efficiency: f64,
}
/// Measures copy bandwidth of the calling thread, in bytes per second.
///
/// Copies a 16 Mi-element `f32` buffer into a second buffer ten times and
/// divides the bytes moved (each element is counted once as a read and once as
/// a write) by the elapsed wall-clock time.
fn measure_bandwidth() -> f64 {
    const ELEMENTS: usize = 16 * 1024 * 1024;
    const PASSES: usize = 10;

    let source: Vec<f32> = vec![1.0f32; ELEMENTS];
    let mut dest: Vec<f32> = vec![0.0f32; ELEMENTS];

    // One untimed pass before the clock starts (presumably to touch every
    // destination page once).
    dest.copy_from_slice(&source);

    let timer = std::time::Instant::now();
    for _pass in 0..PASSES {
        dest.copy_from_slice(&source);
        // Keep the destination observable so the copy cannot be optimized out.
        std::hint::black_box(&dest);
    }
    let seconds = timer.elapsed().as_secs_f64();

    // 2 streams (read + write) x elements x 4 bytes per f32 x passes.
    let bytes_moved = 2.0 * ELEMENTS as f64 * 4.0 * PASSES as f64;
    bytes_moved / seconds
}
/// Measures triad bandwidth (`a[i] = b[i] + s * c[i]`) of the calling thread,
/// in bytes per second, over 16 Mi-element `f32` buffers and ten timed passes.
fn measure_bandwidth_triad() -> f64 {
    const N: usize = 16 * 1024 * 1024;
    const ITERS: usize = 10;
    triad_bandwidth(N, ITERS)
}

/// Runs the triad kernel over `len`-element buffers for `iters` timed passes
/// and returns bytes per second.
///
/// Three streams are counted per element (two reads, one write), four bytes
/// each. One untimed pass runs first. `len` and `iters` should be large enough
/// for the elapsed time to be non-zero; the result is not meaningful otherwise.
fn triad_bandwidth(len: usize, iters: usize) -> f64 {
    let b: Vec<f32> = vec![1.0f32; len];
    let c: Vec<f32> = vec![2.0f32; len];
    let mut a: Vec<f32> = vec![0.0f32; len];
    let s = 3.0f32;

    // Untimed pass before the clock starts.
    triad_pass(&mut a, &b, &c, s);

    let start = std::time::Instant::now();
    for _ in 0..iters {
        // Inputs go through `black_box` so the compiler cannot treat their
        // contents as known constants and skip the loads.
        triad_pass(
            &mut a,
            std::hint::black_box(b.as_slice()),
            std::hint::black_box(c.as_slice()),
            s,
        );
        std::hint::black_box(&a);
    }
    let elapsed = start.elapsed().as_secs_f64();

    let bytes = 3.0 * len as f64 * 4.0 * iters as f64;
    bytes / elapsed
}

/// One triad pass: `a[i] = b[i] + s * c[i]` over the common length of the
/// slices. Zipped iterators avoid the per-element bounds checks of indexing.
fn triad_pass(a: &mut [f32], b: &[f32], c: &[f32], s: f32) {
    for ((dst, &x), &y) in a.iter_mut().zip(b).zip(c) {
        *dst = x + s * y;
    }
}
/// Measures single-core FP32 FLOP/s with the widest kernel the host supports.
///
/// On x86_64 the AVX-512 kernel is preferred, then the AVX2+FMA kernel, chosen
/// by runtime feature detection. Everything else uses the scalar kernel.
fn measure_peak_flops_single_core() -> f64 {
    #[cfg(target_arch = "x86_64")]
    {
        let has_avx512 = std::arch::is_x86_feature_detected!("avx512f");
        if has_avx512 {
            // SAFETY: the kernel is compiled for `avx512f`, and that feature
            // was just confirmed on the running CPU.
            return unsafe { measure_peak_flops_avx512() };
        }

        let has_avx2_fma = std::arch::is_x86_feature_detected!("avx2")
            && std::arch::is_x86_feature_detected!("fma");
        if has_avx2_fma {
            // SAFETY: the kernel is compiled for `avx2,fma`, and both features
            // were just confirmed on the running CPU.
            return unsafe { measure_peak_flops_avx2() };
        }
    }

    measure_peak_flops_scalar()
}
/// Scalar fallback for the single-core FP32 FLOP/s measurement
/// (500 million iterations of four independent multiply-add chains).
fn measure_peak_flops_scalar() -> f64 {
    const ITERS: u64 = 500_000_000;
    scalar_fma_flops(ITERS)
}

/// Runs `iters` iterations of four independent scalar multiply-add chains and
/// returns FLOP/s, counting 8 FLOPs per iteration (4 chains x multiply + add).
///
/// `iters` should be large enough for the elapsed time to be non-zero; the
/// result is not meaningful otherwise.
fn scalar_fma_flops(iters: u64) -> f64 {
    let mut a0 = 1.0f32;
    let mut a1 = 1.1f32;
    let mut a2 = 1.2f32;
    let mut a3 = 1.3f32;
    let m = 1.0000001f32;
    let add = 0.0000001f32;

    let start = std::time::Instant::now();
    for _ in 0..iters {
        a0 = scalar_madd(a0, m, add);
        a1 = scalar_madd(a1, m, add);
        a2 = scalar_madd(a2, m, add);
        a3 = scalar_madd(a3, m, add);
    }
    let elapsed = start.elapsed().as_secs_f64();

    // Consume the accumulators so the loop cannot be discarded.
    std::hint::black_box(a0 + a1 + a2 + a3);
    iters as f64 * 8.0 / elapsed
}

/// Computes `x * m + add` with whichever form maps onto hardware instructions.
///
/// `f32::mul_add` is only fast when the build target has a fused
/// multiply-add instruction; without one it falls back to a library routine,
/// which would make the benchmark measure call overhead. The fused form is
/// therefore used only on aarch64 and on builds with the `fma` target feature.
/// Other builds use a separate multiply and add (still 2 FLOPs).
#[inline]
fn scalar_madd(x: f32, m: f32, add: f32) -> f32 {
    if cfg!(any(target_arch = "aarch64", target_feature = "fma")) {
        x.mul_add(m, add)
    } else {
        x * m + add
    }
}
/// Measures single-core FP32 FLOP/s with 256-bit fused multiply-add instructions.
///
/// Runs 100 million iterations of ten independent `_mm256_fmadd_ps` chains and
/// returns the FLOP count divided by the elapsed wall-clock seconds.
///
/// # Safety
///
/// The function is compiled with the `avx2` and `fma` target features enabled,
/// so the caller must ensure the running CPU supports both.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn measure_peak_flops_avx2() -> f64 {
use std::arch::x86_64::*;
const ITERS: u64 = 100_000_000;
// Ten accumulators, each updated only from its own previous value, so the
// FMAs within one iteration do not depend on one another.
let mut v0 = _mm256_set1_ps(1.0);
let mut v1 = _mm256_set1_ps(1.1);
let mut v2 = _mm256_set1_ps(1.2);
let mut v3 = _mm256_set1_ps(1.3);
let mut v4 = _mm256_set1_ps(1.4);
let mut v5 = _mm256_set1_ps(1.5);
let mut v6 = _mm256_set1_ps(1.6);
let mut v7 = _mm256_set1_ps(1.7);
let mut v8 = _mm256_set1_ps(1.8);
let mut v9 = _mm256_set1_ps(1.9);
let mul = _mm256_set1_ps(1.0000001);
let add = _mm256_set1_ps(0.0000001);
let start = std::time::Instant::now();
for _ in 0..ITERS {
v0 = _mm256_fmadd_ps(v0, mul, add);
v1 = _mm256_fmadd_ps(v1, mul, add);
v2 = _mm256_fmadd_ps(v2, mul, add);
v3 = _mm256_fmadd_ps(v3, mul, add);
v4 = _mm256_fmadd_ps(v4, mul, add);
v5 = _mm256_fmadd_ps(v5, mul, add);
v6 = _mm256_fmadd_ps(v6, mul, add);
v7 = _mm256_fmadd_ps(v7, mul, add);
v8 = _mm256_fmadd_ps(v8, mul, add);
v9 = _mm256_fmadd_ps(v9, mul, add);
}
let elapsed = start.elapsed().as_secs_f64();
// Fold every accumulator into one value and hand it to `black_box` so the
// timed loop has an observable result.
let sum = _mm256_add_ps(v0, v1);
let sum = _mm256_add_ps(sum, v2);
let sum = _mm256_add_ps(sum, v3);
let sum = _mm256_add_ps(sum, v4);
let sum = _mm256_add_ps(sum, v5);
let sum = _mm256_add_ps(sum, v6);
let sum = _mm256_add_ps(sum, v7);
let sum = _mm256_add_ps(sum, v8);
let sum = _mm256_add_ps(sum, v9);
std::hint::black_box(sum);
// 160 FLOPs per iteration = 10 FMAs x 8 f32 lanes x 2 FLOPs per FMA.
ITERS as f64 * 160.0 / elapsed
}
/// Measures single-core FP32 FLOP/s with 512-bit fused multiply-add instructions.
///
/// Runs 100 million iterations of ten independent `_mm512_fmadd_ps` chains and
/// returns the FLOP count divided by the elapsed wall-clock seconds.
///
/// # Safety
///
/// The function is compiled with the `avx512f` target feature enabled, so the
/// caller must ensure the running CPU supports it.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn measure_peak_flops_avx512() -> f64 {
use std::arch::x86_64::*;
const ITERS: u64 = 100_000_000;
// Ten accumulators, each updated only from its own previous value, so the
// FMAs within one iteration do not depend on one another.
let mut v0 = _mm512_set1_ps(1.0);
let mut v1 = _mm512_set1_ps(1.1);
let mut v2 = _mm512_set1_ps(1.2);
let mut v3 = _mm512_set1_ps(1.3);
let mut v4 = _mm512_set1_ps(1.4);
let mut v5 = _mm512_set1_ps(1.5);
let mut v6 = _mm512_set1_ps(1.6);
let mut v7 = _mm512_set1_ps(1.7);
let mut v8 = _mm512_set1_ps(1.8);
let mut v9 = _mm512_set1_ps(1.9);
let mul = _mm512_set1_ps(1.0000001);
let add = _mm512_set1_ps(0.0000001);
let start = std::time::Instant::now();
for _ in 0..ITERS {
v0 = _mm512_fmadd_ps(v0, mul, add);
v1 = _mm512_fmadd_ps(v1, mul, add);
v2 = _mm512_fmadd_ps(v2, mul, add);
v3 = _mm512_fmadd_ps(v3, mul, add);
v4 = _mm512_fmadd_ps(v4, mul, add);
v5 = _mm512_fmadd_ps(v5, mul, add);
v6 = _mm512_fmadd_ps(v6, mul, add);
v7 = _mm512_fmadd_ps(v7, mul, add);
v8 = _mm512_fmadd_ps(v8, mul, add);
v9 = _mm512_fmadd_ps(v9, mul, add);
}
let elapsed = start.elapsed().as_secs_f64();
// Fold every accumulator into one value and hand it to `black_box` so the
// timed loop has an observable result.
let sum = _mm512_add_ps(v0, v1);
let sum = _mm512_add_ps(sum, v2);
let sum = _mm512_add_ps(sum, v3);
let sum = _mm512_add_ps(sum, v4);
let sum = _mm512_add_ps(sum, v5);
let sum = _mm512_add_ps(sum, v6);
let sum = _mm512_add_ps(sum, v7);
let sum = _mm512_add_ps(sum, v8);
let sum = _mm512_add_ps(sum, v9);
std::hint::black_box(sum);
// 320 FLOPs per iteration = 10 FMAs x 16 f32 lanes x 2 FLOPs per FMA.
ITERS as f64 * 320.0 / elapsed
}
/// Runs the host micro-benchmarks and relates them to `theoretical`.
///
/// Bandwidth is the better of the copy and triad measurements. The compute
/// figure is the single-core FLOP/s measurement, compared against the model's
/// FP32 peak divided by the physical core count reported by `num_cpus`. A
/// missing DRAM bandwidth or FP32 peak in the model is replaced by `1.0`.
/// Both efficiencies are percentages.
pub fn measure_empirical(theoretical: &RooflineModel) -> EmpiricalResult {
    // Benchmarks run in the same order as before: copy, triad, then compute.
    let copy_bw = measure_bandwidth();
    let triad_bw = measure_bandwidth_triad();
    let best_bw = f64::max(copy_bw, triad_bw);
    let flops = measure_peak_flops_single_core();

    let model_dram_bw = theoretical
        .peak_bandwidth
        .get(&MemoryLevel::Dram)
        .map_or(1.0, |bw| *bw);
    let model_fp32_peak = theoretical
        .peak_compute
        .get(&Precision::Fp32)
        .map_or(1.0, |peak| *peak);

    // The model's peak covers all cores, while the measurement uses one.
    let physical_cores = num_cpus::get_physical() as f64;
    let per_core_peak = model_fp32_peak / physical_cores;

    let bandwidth_efficiency = best_bw / model_dram_bw * 100.0;
    let compute_efficiency = flops / per_core_peak * 100.0;

    EmpiricalResult {
        measured_bandwidth_bps: best_bw,
        measured_peak_flops: flops,
        measured_ridge_point: flops / best_bw,
        bandwidth_efficiency,
        compute_efficiency,
    }
}
fn print_roofline(model: &RooflineModel) {
println!("\n=== cgp Roofline: {} ===\n", model.target);
println!(" Peak Compute:");
let mut precisions: Vec<_> = model.peak_compute.iter().collect();
precisions.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal));
for (prec, peak) in &precisions {
println!(" {prec:15}: {:8.1} TFLOP/s", *peak / 1e12);
}
println!("\n Peak Bandwidth:");
let mut levels: Vec<_> = model.peak_bandwidth.iter().collect();
levels.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal));
for (level, bw) in &levels {
if **bw >= 1e12 {
println!(" {level:15}: {:8.1} TB/s", *bw / 1e12);
} else {
println!(" {level:15}: {:8.1} GB/s", *bw / 1e9);
}
}
println!("\n Ridge Points (vs DRAM):");
for (prec, _) in &precisions {
if let Some(ridge) = model.ridge_point(**prec, MemoryLevel::Dram) {
println!(" {prec:15}: {:8.1} FLOP/byte", ridge);
}
}
}
pub fn run_roofline(
target: &str,
_kernels: Option<&str>,
export: Option<&str>,
empirical: bool,
json: bool,
) -> Result<()> {
let model = match target {
"cuda" => RooflineModel::rtx_4090(),
"avx2" => {
let cores = num_cpus::get_physical();
RooflineModel::cpu_avx2(3.5, cores, 204.8)
}
"avx512" => {
let cores = num_cpus::get_physical();
RooflineModel::cpu_avx512(3.5, cores, 204.8)
}
"neon" => {
let cores = num_cpus::get_physical();
RooflineModel::cpu_neon(3.0, cores, 51.2)
}
"wgpu" => RooflineModel::rtx_4090(),
other => anyhow::bail!(
"Unknown roofline target: {other}. Supported: cuda, avx2, avx512, neon, wgpu"
),
};
if json && empirical && !target.starts_with("cuda") && target != "wgpu" {
let emp = measure_empirical(&model);
#[derive(Serialize)]
struct EmpiricalJson<'a> {
theoretical: &'a RooflineModel,
empirical: &'a EmpiricalResult,
}
let combined = EmpiricalJson {
theoretical: &model,
empirical: &emp,
};
println!("{}", serde_json::to_string_pretty(&combined)?);
return Ok(());
}
if json {
let json_str = serde_json::to_string_pretty(&model)?;
println!("{json_str}");
return Ok(());
}
print_roofline(&model);
if empirical && !target.starts_with("cuda") && target != "wgpu" {
println!("\n --- Empirical Measurement (single-core) ---\n");
let emp = measure_empirical(&model);
println!(
" DRAM Bandwidth: {:8.1} GB/s ({:.0}% of theoretical)",
emp.measured_bandwidth_bps / 1e9,
emp.bandwidth_efficiency
);
println!(
" Peak FP32 FLOPS: {:8.1} GFLOP/s (single-core, {:.0}% of theoretical)",
emp.measured_peak_flops / 1e9,
emp.compute_efficiency
);
println!(
" Empirical Ridge: {:8.1} FLOP/byte",
emp.measured_ridge_point
);
} else if empirical {
println!("\n (Empirical measurement for GPU targets requires CUDA — use cgp roofline --target avx2 --empirical for CPU)");
}
if let Some(path) = export {
let json_str = serde_json::to_string_pretty(&model)?;
std::fs::write(path, json_str)?;
println!("\n Exported to: {path}");
}
println!();
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `|a - b| / scale`, the relative-difference form used by the tolerance
    /// checks in this module.
    fn rel_diff(a: f64, b: f64, scale: f64) -> f64 {
        (a - b).abs() / scale
    }

    /// FP16 ridge point of the RTX 4090 preset against DRAM.
    #[test]
    fn test_ridge_point_rtx4090_fp16() {
        let gpu = RooflineModel::rtx_4090();
        let expected = 330_000.0 / 1008.0;
        let ridge = gpu.ridge_point(Precision::Fp16, MemoryLevel::Dram).unwrap();
        assert!(
            (ridge - expected).abs() < 0.5,
            "Ridge point {ridge:.1} not within 0.5 of expected {expected:.1}"
        );
    }

    /// Every precision registered by the RTX 4090 preset yields peak / DRAM bandwidth.
    #[test]
    fn test_ridge_points_all_precisions() {
        let gpu = RooflineModel::rtx_4090();
        let dram_bw = 1008.0e9;
        let peaks = [
            (Precision::Fp32, 82.6e12),
            (Precision::Fp16, 330.0e12),
            (Precision::Tf32, 165.0e12),
            (Precision::Int8, 660.0e12),
        ];
        for (prec, peak) in peaks {
            let expected = peak / dram_bw;
            let ridge = gpu.ridge_point(prec, MemoryLevel::Dram).unwrap();
            assert!(
                rel_diff(ridge, expected, expected) < 0.001,
                "{prec}: ridge {ridge:.2} != expected {expected:.2}"
            );
        }
    }

    /// A low-intensity kernel lands left of the ridge.
    #[test]
    fn test_memory_bound_classification() {
        let gpu = RooflineModel::rtx_4090();
        let classified = gpu.classify(8.0, 5e12, Precision::Fp16, MemoryLevel::Dram);
        let point = classified.unwrap();
        assert!(matches!(point.bound, Bound::Memory { .. }));
        assert!(point.distance_to_ridge > 1.0);
    }

    /// A high-intensity kernel lands right of the ridge.
    #[test]
    fn test_compute_bound_classification() {
        let gpu = RooflineModel::rtx_4090();
        let classified = gpu.classify(500.0, 300e12, Precision::Fp16, MemoryLevel::Dram);
        let point = classified.unwrap();
        assert!(matches!(point.bound, Bound::Compute { .. }));
        assert!(point.distance_to_ridge < 1.0);
    }

    /// The ceiling follows the bandwidth slope at low intensity and the compute
    /// roof at high intensity.
    #[test]
    fn test_theoretical_peak() {
        let gpu = RooflineModel::rtx_4090();

        let low_ai = gpu
            .theoretical_peak(8.0, Precision::Fp16, MemoryLevel::Dram)
            .unwrap();
        assert!(rel_diff(low_ai, 8.0 * 1008.0e9, low_ai) < 0.001);

        let high_ai = gpu
            .theoretical_peak(500.0, Precision::Fp16, MemoryLevel::Dram)
            .unwrap();
        assert!(rel_diff(high_ai, 330.0e12, high_ai) < 0.001);
    }

    /// AVX2 preset: 2 x 8 x 2 x frequency x cores.
    #[test]
    fn test_cpu_avx2_peak() {
        let cpu = RooflineModel::cpu_avx2(3.5, 8, 51.2);
        let expected = 2.0 * 8.0 * 2.0 * 3.5e9 * 8.0;
        let fp32_peak = *cpu.peak_compute.get(&Precision::Fp32).unwrap();
        assert!(
            rel_diff(fp32_peak, expected, expected) < 0.001,
            "FP32 peak {:.1} GFLOP/s != expected {:.1} GFLOP/s",
            fp32_peak / 1e9,
            expected / 1e9
        );
    }

    /// The RTX 4090 preset registers 1008 GB/s of DRAM bandwidth.
    #[test]
    fn test_rtx4090_bandwidth_spec() {
        let gpu = RooflineModel::rtx_4090();
        let dram = *gpu.peak_bandwidth.get(&MemoryLevel::Dram).unwrap();
        let deviation = (dram - 1008.0e9).abs();
        assert!(
            deviation < 1e6,
            "DRAM bandwidth {:.1} GB/s != 1008.0 GB/s",
            dram / 1e9
        );
    }

    /// The copy benchmark returns a positive value below 500 GB/s.
    #[test]
    fn test_empirical_bandwidth_positive() {
        let bw = measure_bandwidth();
        assert!(bw > 0.0, "Measured bandwidth must be positive, got {bw}");
        assert!(
            bw < 500.0e9,
            "Single-core bandwidth {:.1} GB/s suspiciously high",
            bw / 1e9
        );
    }

    /// The compute benchmark returns more than 1 GFLOP/s.
    #[test]
    fn test_empirical_flops_positive() {
        let flops = measure_peak_flops_single_core();
        assert!(flops > 0.0, "Measured FLOPS must be positive, got {flops}");
        assert!(
            flops > 1.0e9,
            "Single-core FLOPS {:.1} GFLOP/s suspiciously low",
            flops / 1e9
        );
    }

    /// The measured ridge point lies strictly between 0 and 1000 FLOP/byte.
    #[test]
    fn test_empirical_ridge_plausible() {
        let cpu = RooflineModel::cpu_avx512(3.5, 24, 204.8);
        let emp = measure_empirical(&cpu);
        let plausible = emp.measured_ridge_point > 0.0 && emp.measured_ridge_point < 1000.0;
        assert!(
            plausible,
            "Empirical ridge {:.1} FLOP/byte implausible",
            emp.measured_ridge_point
        );
    }

    /// The triad benchmark returns a positive value.
    #[test]
    fn test_triad_bandwidth_positive() {
        let bw = measure_bandwidth_triad();
        assert!(bw > 0.0, "Triad bandwidth must be positive, got {bw}");
    }
}