Skip to main content

cgp/analysis/
roofline.rs

1//! Roofline model implementation per Williams, Waterman & Patterson (2009) [4].
2//! Supports hierarchical GPU roofline per Yang et al. (2020) [13].
3//! Uses Empirical Roofline Toolkit (ERT) methodology [6].
4
5use anyhow::Result;
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
/// Floating-point precision levels.
///
/// NOTE(review): `RooflineModel::rtx_4090` populates compute peaks for every
/// variant except `Bf16`, so `ridge_point`/`theoretical_peak` queries for
/// `Bf16` return `None` on that model.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Precision {
    /// IEEE 754 single precision.
    Fp32,
    /// Half precision (displayed as "FP16 Tensor").
    Fp16,
    /// NVIDIA TF32 (displayed as "TF32 Tensor").
    Tf32,
    /// 8-bit integer (displayed as "INT8 Tensor"; throughput counted in OP/s).
    Int8,
    /// bfloat16.
    Bf16,
}
18
19impl std::fmt::Display for Precision {
20    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
21        match self {
22            Precision::Fp32 => write!(f, "FP32"),
23            Precision::Fp16 => write!(f, "FP16 Tensor"),
24            Precision::Tf32 => write!(f, "TF32 Tensor"),
25            Precision::Int8 => write!(f, "INT8 Tensor"),
26            Precision::Bf16 => write!(f, "BF16"),
27        }
28    }
29}
30
/// Memory hierarchy levels for hierarchical roofline [13].
///
/// Each level carries its own bandwidth ceiling in
/// `RooflineModel::peak_bandwidth`, yielding one roofline per level.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum MemoryLevel {
    /// L1-level cache.
    L1Cache,
    /// L2-level cache.
    L2Cache,
    /// Device or system DRAM.
    Dram,
    /// Host↔device PCIe link.
    Pcie,
}
39
40impl std::fmt::Display for MemoryLevel {
41    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
42        match self {
43            MemoryLevel::L1Cache => write!(f, "L1 Cache"),
44            MemoryLevel::L2Cache => write!(f, "L2 Cache"),
45            MemoryLevel::Dram => write!(f, "DRAM"),
46            MemoryLevel::Pcie => write!(f, "PCIe"),
47        }
48    }
49}
50
/// Whether a kernel is compute-bound or memory-bound.
///
/// Utilization fields are percentages (`classify` multiplies by 100.0).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Bound {
    /// Below ridge point: memory bandwidth is the bottleneck.
    Memory { bandwidth_utilization: f64 },
    /// Above ridge point: compute throughput is the bottleneck.
    Compute { compute_utilization: f64 },
}
59
/// Roofline model for a specific hardware target.
/// Implements the Empirical Roofline Toolkit (ERT) methodology [6].
///
/// Both maps may be sparse: a missing precision or memory level makes
/// `ridge_point` / `theoretical_peak` return `None` rather than panic.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RooflineModel {
    /// Hardware target name (e.g., "RTX 4090", "AMD EPYC AVX2")
    pub target: String,
    /// Peak compute throughput (FLOP/s) per precision
    pub peak_compute: HashMap<Precision, f64>,
    /// Peak memory bandwidth (bytes/s) per memory level
    pub peak_bandwidth: HashMap<MemoryLevel, f64>,
}
71
72impl RooflineModel {
73    /// Compute the ridge point for a given precision and memory level.
74    /// Ridge = peak_compute / peak_bandwidth (FLOP/byte).
75    /// This is the arithmetic intensity where the kernel transitions
76    /// from memory-bound to compute-bound.
77    pub fn ridge_point(&self, precision: Precision, mem_level: MemoryLevel) -> Option<f64> {
78        let compute = self.peak_compute.get(&precision)?;
79        let bandwidth = self.peak_bandwidth.get(&mem_level)?;
80        if *bandwidth <= 0.0 {
81            return None;
82        }
83        Some(compute / bandwidth)
84    }
85
86    /// Compute the theoretical peak throughput at a given arithmetic intensity.
87    /// throughput = min(peak_compute, AI * peak_bandwidth)
88    pub fn theoretical_peak(
89        &self,
90        arithmetic_intensity: f64,
91        precision: Precision,
92        mem_level: MemoryLevel,
93    ) -> Option<f64> {
94        let compute = self.peak_compute.get(&precision)?;
95        let bandwidth = self.peak_bandwidth.get(&mem_level)?;
96        Some(compute.min(arithmetic_intensity * bandwidth))
97    }
98
99    /// Classify a kernel as compute-bound or memory-bound.
100    pub fn classify(
101        &self,
102        arithmetic_intensity: f64,
103        achieved_throughput: f64,
104        precision: Precision,
105        mem_level: MemoryLevel,
106    ) -> Option<KernelRooflinePoint> {
107        let ridge = self.ridge_point(precision, mem_level)?;
108        let peak = self.theoretical_peak(arithmetic_intensity, precision, mem_level)?;
109        let peak_compute = *self.peak_compute.get(&precision)?;
110
111        let bound = if arithmetic_intensity < ridge {
112            Bound::Memory {
113                bandwidth_utilization: achieved_throughput / peak * 100.0,
114            }
115        } else {
116            Bound::Compute {
117                compute_utilization: achieved_throughput / peak_compute * 100.0,
118            }
119        };
120
121        let efficiency = if peak > 0.0 {
122            achieved_throughput / peak * 100.0
123        } else {
124            0.0
125        };
126
127        let distance_to_ridge = if arithmetic_intensity > 0.0 {
128            ridge / arithmetic_intensity
129        } else {
130            f64::INFINITY
131        };
132
133        Some(KernelRooflinePoint {
134            arithmetic_intensity,
135            achieved_throughput,
136            peak_throughput: peak,
137            efficiency,
138            bound,
139            distance_to_ridge,
140        })
141    }
142
143    /// Create the RTX 4090 roofline model with spec values.
144    pub fn rtx_4090() -> Self {
145        let mut peak_compute = HashMap::new();
146        peak_compute.insert(Precision::Fp32, 82.6e12); // 82.6 TFLOP/s
147        peak_compute.insert(Precision::Fp16, 330.0e12); // 330 TFLOP/s (Tensor)
148        peak_compute.insert(Precision::Tf32, 165.0e12); // 165 TFLOP/s (Tensor)
149        peak_compute.insert(Precision::Int8, 660.0e12); // 660 TOP/s (Tensor)
150
151        let mut peak_bandwidth = HashMap::new();
152        peak_bandwidth.insert(MemoryLevel::L1Cache, 19.0e12); // ~19 TB/s
153        peak_bandwidth.insert(MemoryLevel::L2Cache, 5.3e12); // ~5.3 TB/s
154        peak_bandwidth.insert(MemoryLevel::Dram, 1008.0e9); // 1008 GB/s
155        peak_bandwidth.insert(MemoryLevel::Pcie, 32.0e9); // 32 GB/s PCIe 4.0 x16
156
157        RooflineModel {
158            target: "NVIDIA GeForce RTX 4090 (SM 8.9)".to_string(),
159            peak_compute,
160            peak_bandwidth,
161        }
162    }
163
164    /// Create a CPU AVX2+FMA roofline model.
165    /// Assumes dual 256-bit FMA units (e.g., AMD EPYC / Intel Skylake).
166    pub fn cpu_avx2(freq_ghz: f64, cores: usize, mem_bandwidth_gbps: f64) -> Self {
167        // FP32: 2 FMA units * 8 floats * 2 (FMA = mul + add) * freq * cores
168        let fp32_peak = 2.0 * 8.0 * 2.0 * freq_ghz * 1e9 * cores as f64;
169
170        let mut peak_compute = HashMap::new();
171        peak_compute.insert(Precision::Fp32, fp32_peak);
172
173        let mut peak_bandwidth = HashMap::new();
174        peak_bandwidth.insert(MemoryLevel::Dram, mem_bandwidth_gbps * 1e9);
175
176        RooflineModel {
177            target: format!("CPU AVX2+FMA ({cores} cores @ {freq_ghz} GHz)"),
178            peak_compute,
179            peak_bandwidth,
180        }
181    }
182
183    /// Create a CPU AVX-512 roofline model.
184    /// AVX-512: 2 FMA units * 16 floats * 2 (FMA) * freq * cores.
185    pub fn cpu_avx512(freq_ghz: f64, cores: usize, mem_bandwidth_gbps: f64) -> Self {
186        let fp32_peak = 2.0 * 16.0 * 2.0 * freq_ghz * 1e9 * cores as f64;
187
188        let mut peak_compute = HashMap::new();
189        peak_compute.insert(Precision::Fp32, fp32_peak);
190
191        let mut peak_bandwidth = HashMap::new();
192        peak_bandwidth.insert(MemoryLevel::Dram, mem_bandwidth_gbps * 1e9);
193
194        RooflineModel {
195            target: format!("CPU AVX-512+FMA ({cores} cores @ {freq_ghz} GHz)"),
196            peak_compute,
197            peak_bandwidth,
198        }
199    }
200
201    /// Create an ARM NEON roofline model.
202    /// NEON: 2 FMA units * 4 floats * 2 (FMA) * freq * cores (typical A76/A78).
203    pub fn cpu_neon(freq_ghz: f64, cores: usize, mem_bandwidth_gbps: f64) -> Self {
204        let fp32_peak = 2.0 * 4.0 * 2.0 * freq_ghz * 1e9 * cores as f64;
205
206        let mut peak_compute = HashMap::new();
207        peak_compute.insert(Precision::Fp32, fp32_peak);
208
209        let mut peak_bandwidth = HashMap::new();
210        peak_bandwidth.insert(MemoryLevel::Dram, mem_bandwidth_gbps * 1e9);
211
212        RooflineModel {
213            target: format!("CPU NEON ({cores} cores @ {freq_ghz} GHz)"),
214            peak_compute,
215            peak_bandwidth,
216        }
217    }
218}
219
/// A kernel's position on the roofline chart.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KernelRooflinePoint {
    /// FLOPs per byte transferred
    pub arithmetic_intensity: f64,
    /// Achieved throughput (FLOP/s)
    pub achieved_throughput: f64,
    /// Roofline ceiling throughput (FLOP/s): min(peak compute, AI * bandwidth)
    pub peak_throughput: f64,
    /// Achieved / peak, in percent (0.0 when the ceiling is 0)
    pub efficiency: f64,
    /// Compute or memory bound classification
    pub bound: Bound,
    /// Ridge point / arithmetic_intensity (>1 = memory-bound; infinite when AI = 0)
    pub distance_to_ridge: f64,
}
236
/// Empirical roofline measurement results.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmpiricalResult {
    /// Measured DRAM bandwidth (bytes/s) via STREAM-like test
    pub measured_bandwidth_bps: f64,
    /// Measured peak FLOPS via tight FMA loop
    pub measured_peak_flops: f64,
    /// Empirical ridge point (FLOP/byte)
    pub measured_ridge_point: f64,
    /// Measured / theoretical DRAM bandwidth, in percent
    pub bandwidth_efficiency: f64,
    /// Measured / theoretical single-core FP32 compute, in percent
    pub compute_efficiency: f64,
}
251
/// Measure actual DRAM bandwidth via STREAM-like copy test.
/// Allocates 64 MB arrays, performs timed copy, returns bytes/s.
fn measure_bandwidth() -> f64 {
    const N: usize = 16 * 1024 * 1024; // 16M f32 = 64 MB
    const ITERS: usize = 10;

    let src = vec![1.0f32; N];
    let mut dst = vec![0.0f32; N];

    // Warm up: page in both allocations before the timed region.
    dst.copy_from_slice(&src);

    let t0 = std::time::Instant::now();
    for _ in 0..ITERS {
        dst.copy_from_slice(&src);
        // Keep the copy observable so it cannot be optimized away.
        std::hint::black_box(&dst);
    }
    let secs = t0.elapsed().as_secs_f64();

    // Per iteration: N f32 read + N f32 written = 2 * N * 4 bytes.
    let total_bytes = (2 * N * 4 * ITERS) as f64;
    total_bytes / secs
}
276
/// Measure actual DRAM bandwidth via STREAM-like triad: a[i] = b[i] + s * c[i].
/// More representative than copy — exercises FMA pipeline and memory subsystem together.
///
/// The loops use zipped iterators rather than indexing so the timed kernel
/// carries no per-element bounds checks and vectorizes cleanly — important
/// here because the loop body *is* the measurement.
fn measure_bandwidth_triad() -> f64 {
    const N: usize = 16 * 1024 * 1024; // 16M f32 = 64 MB per array
    const ITERS: usize = 10;

    let b: Vec<f32> = vec![1.0f32; N];
    let c: Vec<f32> = vec![2.0f32; N];
    let mut a: Vec<f32> = vec![0.0f32; N];
    let s = 3.0f32;

    // Warmup: pages in all three arrays before timing.
    for (ai, (bi, ci)) in a.iter_mut().zip(b.iter().zip(&c)) {
        *ai = bi + s * ci;
    }

    let start = std::time::Instant::now();
    for _ in 0..ITERS {
        for (ai, (bi, ci)) in a.iter_mut().zip(b.iter().zip(&c)) {
            *ai = bi + s * ci;
        }
        // Prevent dead-code elimination of the timed writes.
        std::hint::black_box(&a);
    }
    let elapsed = start.elapsed().as_secs_f64();

    // Triad: reads 2 arrays, writes 1 = 3 * N * 4 bytes per iteration
    let bytes = 3.0 * N as f64 * 4.0 * ITERS as f64;
    bytes / elapsed
}
306
/// Measure peak single-core FP32 FLOPS using AVX-512/AVX2 FMA intrinsics.
/// Falls back to scalar if SIMD not available.
///
/// Detection order matters: AVX-512 is tried first (widest vectors, highest
/// per-cycle FLOP ceiling), then AVX2+FMA; the scalar path is the portable
/// fallback and the only path on non-x86_64 builds.
fn measure_peak_flops_single_core() -> f64 {
    #[cfg(target_arch = "x86_64")]
    {
        if std::arch::is_x86_feature_detected!("avx512f") {
            // SAFETY: avx512f detected above
            return unsafe { measure_peak_flops_avx512() };
        }
        if std::arch::is_x86_feature_detected!("avx2") && std::arch::is_x86_feature_detected!("fma")
        {
            // SAFETY: avx2+fma detected above
            return unsafe { measure_peak_flops_avx2() };
        }
    }
    measure_peak_flops_scalar()
}
324
/// Scalar fallback for peak FLOPS measurement.
///
/// Four independent accumulators let the core overlap FMA latency. The
/// multiplier/addend are chosen near 1.0 / 0.0 so the accumulators stay
/// finite for the full run (1.0000001^5e8 ≈ 5e21, well inside f32 range).
fn measure_peak_flops_scalar() -> f64 {
    const ITERS: u64 = 500_000_000;
    let mut a0 = 1.0f32;
    let mut a1 = 1.1f32;
    let mut a2 = 1.2f32;
    let mut a3 = 1.3f32;
    let m = 1.0000001f32;
    let add = 0.0000001f32;

    let start = std::time::Instant::now();
    for _ in 0..ITERS {
        a0 = a0.mul_add(m, add);
        a1 = a1.mul_add(m, add);
        a2 = a2.mul_add(m, add);
        a3 = a3.mul_add(m, add);
    }
    let elapsed = start.elapsed().as_secs_f64();
    // Consume the accumulators so the loop cannot be optimized away.
    std::hint::black_box(a0 + a1 + a2 + a3);
    // 4 FMA = 8 FLOP per iteration
    ITERS as f64 * 8.0 / elapsed
}
347
/// AVX2 FMA peak: 2 FMA units * 8 FP32/vec * 2 ops/FMA = 32 FLOP/cycle.
///
/// # Safety
/// The caller must have verified that the CPU supports both `avx2` and
/// `fma` (see `measure_peak_flops_single_core`); executing these
/// intrinsics without hardware support is undefined behavior.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn measure_peak_flops_avx2() -> f64 {
    use std::arch::x86_64::*;

    const ITERS: u64 = 100_000_000;

    // 10 independent accumulators to saturate both FMA ports
    // (shorter dependency chains would stall on FMA latency).
    let mut v0 = _mm256_set1_ps(1.0);
    let mut v1 = _mm256_set1_ps(1.1);
    let mut v2 = _mm256_set1_ps(1.2);
    let mut v3 = _mm256_set1_ps(1.3);
    let mut v4 = _mm256_set1_ps(1.4);
    let mut v5 = _mm256_set1_ps(1.5);
    let mut v6 = _mm256_set1_ps(1.6);
    let mut v7 = _mm256_set1_ps(1.7);
    let mut v8 = _mm256_set1_ps(1.8);
    let mut v9 = _mm256_set1_ps(1.9);
    let mul = _mm256_set1_ps(1.0000001);
    let add = _mm256_set1_ps(0.0000001);

    let start = std::time::Instant::now();
    for _ in 0..ITERS {
        // 10 vfmadd231ps: each = 8 FMA = 16 FLOP → 160 FLOP/iter
        v0 = _mm256_fmadd_ps(v0, mul, add);
        v1 = _mm256_fmadd_ps(v1, mul, add);
        v2 = _mm256_fmadd_ps(v2, mul, add);
        v3 = _mm256_fmadd_ps(v3, mul, add);
        v4 = _mm256_fmadd_ps(v4, mul, add);
        v5 = _mm256_fmadd_ps(v5, mul, add);
        v6 = _mm256_fmadd_ps(v6, mul, add);
        v7 = _mm256_fmadd_ps(v7, mul, add);
        v8 = _mm256_fmadd_ps(v8, mul, add);
        v9 = _mm256_fmadd_ps(v9, mul, add);
    }
    let elapsed = start.elapsed().as_secs_f64();
    // Prevent dead-code elimination
    let sum = _mm256_add_ps(v0, v1);
    let sum = _mm256_add_ps(sum, v2);
    let sum = _mm256_add_ps(sum, v3);
    let sum = _mm256_add_ps(sum, v4);
    let sum = _mm256_add_ps(sum, v5);
    let sum = _mm256_add_ps(sum, v6);
    let sum = _mm256_add_ps(sum, v7);
    let sum = _mm256_add_ps(sum, v8);
    let sum = _mm256_add_ps(sum, v9);
    std::hint::black_box(sum);

    // 10 FMAs * 8 elements * 2 ops(mul+add) = 160 FLOP per iteration
    ITERS as f64 * 160.0 / elapsed
}
400
/// AVX-512 FMA peak: 2 FMA units * 16 FP32/vec * 2 ops/FMA = 64 FLOP/cycle.
///
/// # Safety
/// The caller must have verified that the CPU supports `avx512f` (see
/// `measure_peak_flops_single_core`); executing these intrinsics without
/// hardware support is undefined behavior.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn measure_peak_flops_avx512() -> f64 {
    use std::arch::x86_64::*;

    const ITERS: u64 = 100_000_000;

    // 10 independent accumulators to saturate both FMA512 ports
    // (shorter dependency chains would stall on FMA latency).
    let mut v0 = _mm512_set1_ps(1.0);
    let mut v1 = _mm512_set1_ps(1.1);
    let mut v2 = _mm512_set1_ps(1.2);
    let mut v3 = _mm512_set1_ps(1.3);
    let mut v4 = _mm512_set1_ps(1.4);
    let mut v5 = _mm512_set1_ps(1.5);
    let mut v6 = _mm512_set1_ps(1.6);
    let mut v7 = _mm512_set1_ps(1.7);
    let mut v8 = _mm512_set1_ps(1.8);
    let mut v9 = _mm512_set1_ps(1.9);
    let mul = _mm512_set1_ps(1.0000001);
    let add = _mm512_set1_ps(0.0000001);

    let start = std::time::Instant::now();
    for _ in 0..ITERS {
        // 10 vfmadd231ps zmm: each = 16 FMA = 32 FLOP → 320 FLOP/iter
        v0 = _mm512_fmadd_ps(v0, mul, add);
        v1 = _mm512_fmadd_ps(v1, mul, add);
        v2 = _mm512_fmadd_ps(v2, mul, add);
        v3 = _mm512_fmadd_ps(v3, mul, add);
        v4 = _mm512_fmadd_ps(v4, mul, add);
        v5 = _mm512_fmadd_ps(v5, mul, add);
        v6 = _mm512_fmadd_ps(v6, mul, add);
        v7 = _mm512_fmadd_ps(v7, mul, add);
        v8 = _mm512_fmadd_ps(v8, mul, add);
        v9 = _mm512_fmadd_ps(v9, mul, add);
    }
    let elapsed = start.elapsed().as_secs_f64();
    // Prevent dead-code elimination
    let sum = _mm512_add_ps(v0, v1);
    let sum = _mm512_add_ps(sum, v2);
    let sum = _mm512_add_ps(sum, v3);
    let sum = _mm512_add_ps(sum, v4);
    let sum = _mm512_add_ps(sum, v5);
    let sum = _mm512_add_ps(sum, v6);
    let sum = _mm512_add_ps(sum, v7);
    let sum = _mm512_add_ps(sum, v8);
    let sum = _mm512_add_ps(sum, v9);
    std::hint::black_box(sum);

    // 10 FMAs * 16 elements * 2 ops = 320 FLOP per iteration
    ITERS as f64 * 320.0 / elapsed
}
452
453/// Run empirical roofline measurement and return enhanced model.
454pub fn measure_empirical(theoretical: &RooflineModel) -> EmpiricalResult {
455    let bw_copy = measure_bandwidth();
456    let bw_triad = measure_bandwidth_triad();
457    // Use the better of copy and triad as the bandwidth number
458    let measured_bw = bw_copy.max(bw_triad);
459    let measured_flops = measure_peak_flops_single_core();
460
461    let theoretical_bw = theoretical
462        .peak_bandwidth
463        .get(&MemoryLevel::Dram)
464        .copied()
465        .unwrap_or(1.0);
466    let theoretical_flops = theoretical
467        .peak_compute
468        .get(&Precision::Fp32)
469        .copied()
470        .unwrap_or(1.0);
471
472    // For single-core measurement, divide theoretical by core count
473    // The theoretical model includes all cores, so single-core peak = theoretical / cores
474    let cores = num_cpus::get_physical() as f64;
475    let single_core_theoretical = theoretical_flops / cores;
476
477    EmpiricalResult {
478        measured_bandwidth_bps: measured_bw,
479        measured_peak_flops: measured_flops,
480        measured_ridge_point: measured_flops / measured_bw,
481        bandwidth_efficiency: measured_bw / theoretical_bw * 100.0,
482        compute_efficiency: measured_flops / single_core_theoretical * 100.0,
483    }
484}
485
486/// Print roofline model to stdout in human-readable format.
487fn print_roofline(model: &RooflineModel) {
488    println!("\n=== cgp Roofline: {} ===\n", model.target);
489
490    println!("  Peak Compute:");
491    let mut precisions: Vec<_> = model.peak_compute.iter().collect();
492    precisions.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal));
493    for (prec, peak) in &precisions {
494        println!("    {prec:15}: {:8.1} TFLOP/s", *peak / 1e12);
495    }
496
497    println!("\n  Peak Bandwidth:");
498    let mut levels: Vec<_> = model.peak_bandwidth.iter().collect();
499    levels.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal));
500    for (level, bw) in &levels {
501        if **bw >= 1e12 {
502            println!("    {level:15}: {:8.1} TB/s", *bw / 1e12);
503        } else {
504            println!("    {level:15}: {:8.1} GB/s", *bw / 1e9);
505        }
506    }
507
508    println!("\n  Ridge Points (vs DRAM):");
509    for (prec, _) in &precisions {
510        if let Some(ridge) = model.ridge_point(**prec, MemoryLevel::Dram) {
511            println!("    {prec:15}: {:8.1} FLOP/byte", ridge);
512        }
513    }
514}
515
516/// Run the `cgp roofline` command.
517pub fn run_roofline(
518    target: &str,
519    _kernels: Option<&str>,
520    export: Option<&str>,
521    empirical: bool,
522    json: bool,
523) -> Result<()> {
524    let model = match target {
525        "cuda" => RooflineModel::rtx_4090(),
526        "avx2" => {
527            let cores = num_cpus::get_physical();
528            RooflineModel::cpu_avx2(3.5, cores, 204.8)
529        }
530        "avx512" => {
531            let cores = num_cpus::get_physical();
532            RooflineModel::cpu_avx512(3.5, cores, 204.8)
533        }
534        "neon" => {
535            let cores = num_cpus::get_physical();
536            RooflineModel::cpu_neon(3.0, cores, 51.2)
537        }
538        "wgpu" => RooflineModel::rtx_4090(),
539        other => anyhow::bail!(
540            "Unknown roofline target: {other}. Supported: cuda, avx2, avx512, neon, wgpu"
541        ),
542    };
543
544    // JSON + empirical: output combined JSON only
545    if json && empirical && !target.starts_with("cuda") && target != "wgpu" {
546        let emp = measure_empirical(&model);
547        #[derive(Serialize)]
548        struct EmpiricalJson<'a> {
549            theoretical: &'a RooflineModel,
550            empirical: &'a EmpiricalResult,
551        }
552        let combined = EmpiricalJson {
553            theoretical: &model,
554            empirical: &emp,
555        };
556        println!("{}", serde_json::to_string_pretty(&combined)?);
557        return Ok(());
558    }
559
560    if json {
561        let json_str = serde_json::to_string_pretty(&model)?;
562        println!("{json_str}");
563        return Ok(());
564    }
565
566    print_roofline(&model);
567
568    if empirical && !target.starts_with("cuda") && target != "wgpu" {
569        println!("\n  --- Empirical Measurement (single-core) ---\n");
570        let emp = measure_empirical(&model);
571
572        println!(
573            "    DRAM Bandwidth:  {:8.1} GB/s  ({:.0}% of theoretical)",
574            emp.measured_bandwidth_bps / 1e9,
575            emp.bandwidth_efficiency
576        );
577        println!(
578            "    Peak FP32 FLOPS: {:8.1} GFLOP/s (single-core, {:.0}% of theoretical)",
579            emp.measured_peak_flops / 1e9,
580            emp.compute_efficiency
581        );
582        println!(
583            "    Empirical Ridge: {:8.1} FLOP/byte",
584            emp.measured_ridge_point
585        );
586    } else if empirical {
587        println!("\n  (Empirical measurement for GPU targets requires CUDA — use cgp roofline --target avx2 --empirical for CPU)");
588    }
589
590    if let Some(path) = export {
591        let json_str = serde_json::to_string_pretty(&model)?;
592        std::fs::write(path, json_str)?;
593        println!("\n  Exported to: {path}");
594    }
595
596    println!();
597    Ok(())
598}
599
/// Unit and falsification tests for the roofline model.
///
/// `FALSIFY-*` tags reference claims tracked elsewhere in the project.
/// NOTE(review): the `*_EMPIRICAL_*` and triad tests run the actual
/// benchmark kernels, so they take noticeably longer than the pure-math
/// tests — especially in debug builds.
#[cfg(test)]
mod tests {
    use super::*;

    /// FALSIFY-CGP-021: Ridge point must be correctly computed.
    /// Given: peak_compute = 330 TFLOP/s, peak_bandwidth = 1008 GB/s
    /// Then: ridge_point = 330000 / 1008 = 327.4 FLOP/byte (within 1%)
    #[test]
    fn test_ridge_point_rtx4090_fp16() {
        let model = RooflineModel::rtx_4090();
        let ridge = model
            .ridge_point(Precision::Fp16, MemoryLevel::Dram)
            .unwrap();
        let expected = 330_000.0 / 1008.0; // 327.38...
        assert!(
            (ridge - expected).abs() < 0.5,
            "Ridge point {ridge:.1} not within 0.5 of expected {expected:.1}"
        );
    }

    /// FALSIFY-CGP-021: All precision ridge points match manual calculation.
    #[test]
    fn test_ridge_points_all_precisions() {
        let model = RooflineModel::rtx_4090();
        let dram_bw = 1008.0e9;

        // (precision, spec peak FLOP/s) pairs mirroring rtx_4090().
        let cases = [
            (Precision::Fp32, 82.6e12),
            (Precision::Fp16, 330.0e12),
            (Precision::Tf32, 165.0e12),
            (Precision::Int8, 660.0e12),
        ];

        for (prec, peak) in cases {
            let ridge = model.ridge_point(prec, MemoryLevel::Dram).unwrap();
            let expected = peak / dram_bw;
            assert!(
                (ridge - expected).abs() / expected < 0.001,
                "{prec}: ridge {ridge:.2} != expected {expected:.2}"
            );
        }
    }

    /// FALSIFY-ROOF-002: Memory-bound kernel classified correctly.
    #[test]
    fn test_memory_bound_classification() {
        let model = RooflineModel::rtx_4090();
        // AI = 8.0 FLOP/byte, well below ridge of 327.4
        let point = model
            .classify(8.0, 5e12, Precision::Fp16, MemoryLevel::Dram)
            .unwrap();
        assert!(matches!(point.bound, Bound::Memory { .. }));
        assert!(point.distance_to_ridge > 1.0);
    }

    /// FALSIFY-ROOF-003: Compute-bound kernel classified correctly.
    #[test]
    fn test_compute_bound_classification() {
        let model = RooflineModel::rtx_4090();
        // AI = 500.0 FLOP/byte, above ridge of 327.4
        let point = model
            .classify(500.0, 300e12, Precision::Fp16, MemoryLevel::Dram)
            .unwrap();
        assert!(matches!(point.bound, Bound::Compute { .. }));
        assert!(point.distance_to_ridge < 1.0);
    }

    /// Theoretical peak follows min(compute, AI*bandwidth).
    #[test]
    fn test_theoretical_peak() {
        let model = RooflineModel::rtx_4090();
        // Memory-bound region: peak = AI * bandwidth
        let low_ai = model
            .theoretical_peak(8.0, Precision::Fp16, MemoryLevel::Dram)
            .unwrap();
        assert!((low_ai - 8.0 * 1008.0e9).abs() / low_ai < 0.001);

        // Compute-bound region: peak = compute peak
        let high_ai = model
            .theoretical_peak(500.0, Precision::Fp16, MemoryLevel::Dram)
            .unwrap();
        assert!((high_ai - 330.0e12).abs() / high_ai < 0.001);
    }

    /// CPU AVX2 model: peak = 2 FMA units * 8 floats * 2 ops * freq * cores.
    #[test]
    fn test_cpu_avx2_peak() {
        let model = RooflineModel::cpu_avx2(3.5, 8, 51.2);
        let fp32_peak = *model.peak_compute.get(&Precision::Fp32).unwrap();
        let expected = 2.0 * 8.0 * 2.0 * 3.5e9 * 8.0; // 896 GFLOP/s
        assert!(
            (fp32_peak - expected).abs() / expected < 0.001,
            "FP32 peak {:.1} GFLOP/s != expected {:.1} GFLOP/s",
            fp32_peak / 1e9,
            expected / 1e9
        );
    }

    /// RTX 4090 bandwidth spec: 384-bit * 21 Gbps = 1008 GB/s.
    #[test]
    fn test_rtx4090_bandwidth_spec() {
        let model = RooflineModel::rtx_4090();
        let dram = *model.peak_bandwidth.get(&MemoryLevel::Dram).unwrap();
        assert!(
            (dram - 1008.0e9).abs() < 1e6,
            "DRAM bandwidth {:.1} GB/s != 1008.0 GB/s",
            dram / 1e9
        );
    }

    /// FALSIFY-CGP-EMPIRICAL-001: Empirical bandwidth must be > 0 and < theoretical.
    #[test]
    fn test_empirical_bandwidth_positive() {
        let bw = measure_bandwidth();
        assert!(bw > 0.0, "Measured bandwidth must be positive, got {bw}");
        // Single-core bandwidth should be less than full system theoretical
        assert!(
            bw < 500.0e9,
            "Single-core bandwidth {:.1} GB/s suspiciously high",
            bw / 1e9
        );
    }

    /// FALSIFY-CGP-EMPIRICAL-002: Empirical FLOPS must be > 0.
    #[test]
    fn test_empirical_flops_positive() {
        let flops = measure_peak_flops_single_core();
        assert!(flops > 0.0, "Measured FLOPS must be positive, got {flops}");
        // Single-core should be at least 1 GFLOP/s on any modern CPU
        assert!(
            flops > 1.0e9,
            "Single-core FLOPS {:.1} GFLOP/s suspiciously low",
            flops / 1e9
        );
    }

    /// FALSIFY-CGP-EMPIRICAL-003: Empirical ridge point must be plausible.
    #[test]
    fn test_empirical_ridge_plausible() {
        let model = RooflineModel::cpu_avx512(3.5, 24, 204.8);
        let emp = measure_empirical(&model);
        // Ridge point should be > 0 and < 1000 FLOP/byte for any CPU
        assert!(
            emp.measured_ridge_point > 0.0 && emp.measured_ridge_point < 1000.0,
            "Empirical ridge {:.1} FLOP/byte implausible",
            emp.measured_ridge_point
        );
    }

    /// FALSIFY-CGP-EMPIRICAL-004: Triad bandwidth should be > 0.
    #[test]
    fn test_triad_bandwidth_positive() {
        let bw = measure_bandwidth_triad();
        assert!(bw > 0.0, "Triad bandwidth must be positive, got {bw}");
    }
}
755}