Skip to main content

cbtop/
roofline.rs

1//! Roofline Model Analyzer (PMAT-022)
2//!
3//! Implements Williams Roofline Model per Citation [70] for visual bottleneck
4//! analysis. Determines if workload is compute-bound or memory-bound based
5//! on operational intensity.
6//!
7//! # Roofline Model Components
8//!
9//! | Component | Formula | Unit |
10//! |-----------|---------|------|
11//! | Operational Intensity (OI) | FLOP / Bytes | FLOP/Byte |
12//! | Peak Compute | Theoretical GFLOPS | GFLOP/s |
13//! | Peak Memory BW | Memory bandwidth | GB/s |
14//! | Ridge Point | Peak Compute / Peak BW | FLOP/Byte |
15//!
16//! # Citations
17//!
18//! - [Williams et al. 2009] "Roofline: An Insightful Visual Performance Model" CACM 52(4)
19//! - [Ofenbeck et al. 2014] "Applying the Roofline Model" IEEE ISPASS
20
/// Bottleneck classification based on operational intensity vs ridge point.
///
/// Produced by [`HardwareProfile::classify_bottleneck`], which compares a
/// workload's operational intensity (OI) against the hardware ridge point
/// using a ±10% tolerance band.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BottleneckType {
    /// OI < Ridge Point - workload is limited by memory bandwidth
    MemoryBound,
    /// OI > Ridge Point - workload is limited by compute throughput
    ComputeBound,
    /// OI ≈ Ridge Point (within 10%) - both equally important
    Balanced,
}
31
32impl BottleneckType {
33    /// Get optimization recommendation
34    pub fn recommendation(&self) -> &'static str {
35        match self {
36            BottleneckType::MemoryBound => {
37                "Improve memory access patterns: coalescing, prefetching, cache blocking"
38            }
39            BottleneckType::ComputeBound => {
40                "Improve compute efficiency: SIMD, kernel fusion, algorithm optimization"
41            }
42            BottleneckType::Balanced => {
43                "Both memory and compute matter equally; profile to find specific bottleneck"
44            }
45        }
46    }
47
48    /// Get short name
49    pub fn name(&self) -> &'static str {
50        match self {
51            BottleneckType::MemoryBound => "memory-bound",
52            BottleneckType::ComputeBound => "compute-bound",
53            BottleneckType::Balanced => "balanced",
54        }
55    }
56}
57
/// Hardware profile for roofline analysis.
///
/// Captures the two roofline ceilings of a device: peak compute (GFLOP/s)
/// and peak memory bandwidth (GB/s). The ridge point is derived once in
/// [`HardwareProfile::new`] and cached in a private field; read it via
/// [`HardwareProfile::ridge_point`].
#[derive(Debug, Clone)]
pub struct HardwareProfile {
    /// Device name
    pub name: String,
    /// Peak compute throughput in GFLOPS (FP32)
    pub peak_gflops: f64,
    /// Peak memory bandwidth in GB/s
    pub peak_bandwidth_gbps: f64,
    /// Ridge point (peak_gflops / peak_bandwidth_gbps); 0.0 when bandwidth <= 0
    ridge_point: f64,
}
70
71impl HardwareProfile {
72    /// Create a new hardware profile
73    pub fn new(name: &str, peak_gflops: f64, peak_bandwidth_gbps: f64) -> Self {
74        let ridge_point = if peak_bandwidth_gbps > 0.0 {
75            peak_gflops / peak_bandwidth_gbps
76        } else {
77            0.0
78        };
79        Self {
80            name: name.to_string(),
81            peak_gflops,
82            peak_bandwidth_gbps,
83            ridge_point,
84        }
85    }
86
87    /// Get the ridge point (transition from memory-bound to compute-bound)
88    pub fn ridge_point(&self) -> f64 {
89        self.ridge_point
90    }
91
92    /// Calculate theoretical peak performance for a given OI
93    pub fn theoretical_peak_at_oi(&self, operational_intensity: f64) -> f64 {
94        // Roofline: min(peak_compute, peak_bandwidth * OI)
95        let memory_bound_peak = self.peak_bandwidth_gbps * operational_intensity;
96        self.peak_gflops.min(memory_bound_peak)
97    }
98
99    /// Classify bottleneck based on operational intensity
100    pub fn classify_bottleneck(&self, operational_intensity: f64) -> BottleneckType {
101        let ratio = operational_intensity / self.ridge_point;
102
103        if ratio < 0.9 {
104            BottleneckType::MemoryBound
105        } else if ratio > 1.1 {
106            BottleneckType::ComputeBound
107        } else {
108            BottleneckType::Balanced
109        }
110    }
111}
112
113/// Pre-defined hardware profiles
114pub mod profiles {
115    use super::HardwareProfile;
116
117    /// NVIDIA A100 SXM 40GB/80GB
118    pub fn a100_sxm() -> HardwareProfile {
119        HardwareProfile::new("NVIDIA A100 SXM", 19_500.0, 2_039.0)
120    }
121
122    /// NVIDIA H100 SXM 80GB
123    pub fn h100_sxm() -> HardwareProfile {
124        HardwareProfile::new("NVIDIA H100 SXM", 51_200.0, 3_350.0)
125    }
126
127    /// NVIDIA RTX 4090
128    pub fn rtx_4090() -> HardwareProfile {
129        HardwareProfile::new("NVIDIA RTX 4090", 82_580.0, 1_008.0)
130    }
131
132    /// NVIDIA RTX 3090
133    pub fn rtx_3090() -> HardwareProfile {
134        HardwareProfile::new("NVIDIA RTX 3090", 35_580.0, 936.0)
135    }
136
137    /// AMD Instinct MI250X
138    pub fn mi250x() -> HardwareProfile {
139        HardwareProfile::new("AMD Instinct MI250X", 47_872.0, 3_277.0)
140    }
141
142    /// Intel Xeon with AVX-512 (per core)
143    pub fn avx512_per_core() -> HardwareProfile {
144        HardwareProfile::new("AVX-512 (per core)", 128.0, 50.0)
145    }
146
147    /// Apple M2 Ultra GPU
148    pub fn m2_ultra_gpu() -> HardwareProfile {
149        HardwareProfile::new("Apple M2 Ultra GPU", 27_200.0, 800.0)
150    }
151
152    /// All predefined profiles
153    pub fn all() -> Vec<HardwareProfile> {
154        vec![
155            a100_sxm(),
156            h100_sxm(),
157            rtx_4090(),
158            rtx_3090(),
159            mi250x(),
160            avx512_per_core(),
161            m2_ultra_gpu(),
162        ]
163    }
164}
165
/// Workload metrics for roofline analysis.
///
/// `measured_gflops` is derived in [`WorkloadMetrics::new`] from
/// `total_flops` and `execution_time_s`; the remaining fields are stored as
/// supplied by the caller.
#[derive(Debug, Clone)]
pub struct WorkloadMetrics {
    /// Workload name
    pub name: String,
    /// Total floating-point operations
    pub total_flops: f64,
    /// Total bytes transferred (read + write)
    pub total_bytes: f64,
    /// Measured performance in GFLOPS (total_flops / execution_time_s / 1e9)
    pub measured_gflops: f64,
    /// Execution time in seconds
    pub execution_time_s: f64,
}
180
181impl WorkloadMetrics {
182    /// Create new workload metrics
183    pub fn new(name: &str, total_flops: f64, total_bytes: f64, execution_time_s: f64) -> Self {
184        let measured_gflops = if execution_time_s > 0.0 {
185            total_flops / execution_time_s / 1e9
186        } else {
187            0.0
188        };
189        Self {
190            name: name.to_string(),
191            total_flops,
192            total_bytes,
193            measured_gflops,
194            execution_time_s,
195        }
196    }
197
198    /// Calculate operational intensity (FLOP/Byte)
199    pub fn operational_intensity(&self) -> f64 {
200        if self.total_bytes > 0.0 {
201            self.total_flops / self.total_bytes
202        } else {
203            0.0
204        }
205    }
206}
207
/// Roofline analysis result.
///
/// Produced by [`RooflineAnalysis::analyze`]; pairs a workload with the
/// hardware profile it was evaluated against plus the derived roofline
/// quantities.
#[derive(Debug, Clone)]
pub struct RooflineAnalysis {
    /// Hardware profile used
    pub hardware: HardwareProfile,
    /// Workload metrics
    pub workload: WorkloadMetrics,
    /// Operational intensity (FLOP/Byte)
    pub operational_intensity: f64,
    /// Theoretical peak (GFLOPS) achievable at this OI on this hardware
    pub theoretical_peak: f64,
    /// Attained performance as a percentage (measured / theoretical * 100)
    pub attained_efficiency: f64,
    /// Bottleneck classification
    pub bottleneck: BottleneckType,
}
224
225impl RooflineAnalysis {
226    /// Perform roofline analysis
227    pub fn analyze(hardware: &HardwareProfile, workload: &WorkloadMetrics) -> Self {
228        let operational_intensity = workload.operational_intensity();
229        let theoretical_peak = hardware.theoretical_peak_at_oi(operational_intensity);
230        let attained_efficiency = if theoretical_peak > 0.0 {
231            (workload.measured_gflops / theoretical_peak) * 100.0
232        } else {
233            0.0
234        };
235        let bottleneck = hardware.classify_bottleneck(operational_intensity);
236
237        Self {
238            hardware: hardware.clone(),
239            workload: workload.clone(),
240            operational_intensity,
241            theoretical_peak,
242            attained_efficiency,
243            bottleneck,
244        }
245    }
246
247    /// Get actionable recommendation
248    pub fn recommendation(&self) -> String {
249        let base = self.bottleneck.recommendation();
250        format!(
251            "{} (OI={:.2} FLOP/Byte, Ridge={:.2}, Efficiency={:.1}%)",
252            base,
253            self.operational_intensity,
254            self.hardware.ridge_point(),
255            self.attained_efficiency
256        )
257    }
258}
259
/// Roofline visualization data point.
///
/// Stores both linear and log2 coordinates so log-log plots can be drawn
/// without recomputing logarithms.
#[derive(Debug, Clone)]
pub struct RooflinePlotPoint {
    /// Log2 of operational intensity (x-axis)
    pub log_oi: f64,
    /// Log2 of performance in GFLOPS (y-axis)
    pub log_perf: f64,
    /// Original OI (linear scale)
    pub oi: f64,
    /// Original performance (linear scale, GFLOPS)
    pub perf: f64,
    /// Label shown next to the point
    pub label: String,
}
274
275impl RooflinePlotPoint {
276    /// Create a plot point
277    pub fn new(label: &str, oi: f64, perf: f64) -> Self {
278        Self {
279            log_oi: oi.log2(),
280            log_perf: perf.log2(),
281            oi,
282            perf,
283            label: label.to_string(),
284        }
285    }
286}
287
/// Roofline plot data for visualization.
///
/// Built by [`RooflinePlot::generate`]; the two roof lines are sampled on
/// geometric (log-spaced) OI grids.
#[derive(Debug, Clone)]
pub struct RooflinePlot {
    /// Hardware profile
    pub hardware: HardwareProfile,
    /// Memory-bound line points (OI from 0.1 to ridge, 21 samples)
    pub memory_bound_line: Vec<RooflinePlotPoint>,
    /// Compute-bound line points (OI from ridge to 100, 11 samples)
    pub compute_bound_line: Vec<RooflinePlotPoint>,
    /// Workload points at their measured (OI, GFLOPS) coordinates
    pub workload_points: Vec<RooflinePlotPoint>,
    /// Ridge point marker (at peak compute)
    pub ridge_point: RooflinePlotPoint,
}
302
303impl RooflinePlot {
304    /// Generate roofline plot data
305    pub fn generate(hardware: &HardwareProfile, workloads: &[WorkloadMetrics]) -> Self {
306        let ridge = hardware.ridge_point();
307
308        // Memory-bound line (slope = bandwidth)
309        let memory_bound_line: Vec<RooflinePlotPoint> = (0..=20)
310            .map(|i| {
311                let oi = 0.1 * (ridge / 0.1).powf(i as f64 / 20.0);
312                let perf = hardware.peak_bandwidth_gbps * oi;
313                RooflinePlotPoint::new("memory-bound", oi, perf)
314            })
315            .collect();
316
317        // Compute-bound line (flat at peak)
318        let compute_bound_line: Vec<RooflinePlotPoint> = (0..=10)
319            .map(|i| {
320                let oi = ridge * (100.0 / ridge).powf(i as f64 / 10.0);
321                RooflinePlotPoint::new("compute-bound", oi, hardware.peak_gflops)
322            })
323            .collect();
324
325        // Workload points
326        let workload_points: Vec<RooflinePlotPoint> = workloads
327            .iter()
328            .map(|w| RooflinePlotPoint::new(&w.name, w.operational_intensity(), w.measured_gflops))
329            .collect();
330
331        // Ridge point
332        let ridge_point = RooflinePlotPoint::new("ridge", ridge, hardware.peak_gflops);
333
334        Self {
335            hardware: hardware.clone(),
336            memory_bound_line,
337            compute_bound_line,
338            workload_points,
339            ridge_point,
340        }
341    }
342}
343
/// Batch analysis of multiple workloads against one hardware profile.
///
/// Built by [`BatchRooflineAnalysis::analyze`]; aggregate statistics are
/// available via [`BatchRooflineAnalysis::summary`].
#[derive(Debug)]
pub struct BatchRooflineAnalysis {
    /// Hardware profile
    pub hardware: HardwareProfile,
    /// Individual analyses, one per input workload, in input order
    pub analyses: Vec<RooflineAnalysis>,
}
352
353impl BatchRooflineAnalysis {
354    /// Analyze multiple workloads
355    pub fn analyze(hardware: &HardwareProfile, workloads: &[WorkloadMetrics]) -> Self {
356        let analyses = workloads
357            .iter()
358            .map(|w| RooflineAnalysis::analyze(hardware, w))
359            .collect();
360        Self {
361            hardware: hardware.clone(),
362            analyses,
363        }
364    }
365
366    /// Get summary statistics
367    pub fn summary(&self) -> BatchSummary {
368        let memory_bound = self
369            .analyses
370            .iter()
371            .filter(|a| a.bottleneck == BottleneckType::MemoryBound)
372            .count();
373        let compute_bound = self
374            .analyses
375            .iter()
376            .filter(|a| a.bottleneck == BottleneckType::ComputeBound)
377            .count();
378        let balanced = self
379            .analyses
380            .iter()
381            .filter(|a| a.bottleneck == BottleneckType::Balanced)
382            .count();
383        let avg_efficiency = if self.analyses.is_empty() {
384            0.0
385        } else {
386            self.analyses
387                .iter()
388                .map(|a| a.attained_efficiency)
389                .sum::<f64>()
390                / self.analyses.len() as f64
391        };
392
393        BatchSummary {
394            total: self.analyses.len(),
395            memory_bound,
396            compute_bound,
397            balanced,
398            avg_efficiency,
399        }
400    }
401}
402
/// Summary of batch analysis.
///
/// The three bottleneck counts always sum to `total`.
#[derive(Debug, Clone)]
pub struct BatchSummary {
    /// Total workloads analyzed
    pub total: usize,
    /// Number of memory-bound workloads
    pub memory_bound: usize,
    /// Number of compute-bound workloads
    pub compute_bound: usize,
    /// Number of balanced workloads
    pub balanced: usize,
    /// Average attained efficiency in percent (0.0 when no workloads)
    pub avg_efficiency: f64,
}
417
#[cfg(test)]
mod tests {
    use super::*;

    /// Synthetic device with ridge point = 1000 / 100 = 10 FLOP/Byte.
    fn test_profile() -> HardwareProfile {
        HardwareProfile::new("Test", 1000.0, 100.0)
    }

    #[test]
    fn test_ridge_point_calculation() {
        assert!((test_profile().ridge_point() - 10.0).abs() < 0.01);
    }

    #[test]
    fn test_bottleneck_classification_memory_bound() {
        // OI = 5 < Ridge = 10 → memory-bound
        let verdict = test_profile().classify_bottleneck(5.0);
        assert_eq!(verdict, BottleneckType::MemoryBound);
    }

    #[test]
    fn test_bottleneck_classification_compute_bound() {
        // OI = 20 > Ridge = 10 → compute-bound
        let verdict = test_profile().classify_bottleneck(20.0);
        assert_eq!(verdict, BottleneckType::ComputeBound);
    }

    #[test]
    fn test_bottleneck_classification_balanced() {
        // OI = 10 ≈ Ridge = 10 → balanced
        let verdict = test_profile().classify_bottleneck(10.0);
        assert_eq!(verdict, BottleneckType::Balanced);
    }

    #[test]
    fn test_operational_intensity() {
        // 1000 FLOP over 100 bytes → OI = 10
        let workload = WorkloadMetrics::new("test", 1000.0, 100.0, 1.0);
        assert!((workload.operational_intensity() - 10.0).abs() < 0.01);
    }

    #[test]
    fn test_a100_profile() {
        let a100 = profiles::a100_sxm();
        assert!((a100.peak_gflops - 19500.0).abs() < 1.0);
        assert!((a100.peak_bandwidth_gbps - 2039.0).abs() < 1.0);
        // Ridge point ≈ 9.56
        assert!((a100.ridge_point() - 9.56).abs() < 0.1);
    }

    #[test]
    fn test_h100_profile() {
        // Ridge point ≈ 15.28
        assert!((profiles::h100_sxm().ridge_point() - 15.28).abs() < 0.1);
    }

    #[test]
    fn test_rtx_4090_profile() {
        // Ridge point ≈ 81.9
        assert!((profiles::rtx_4090().ridge_point() - 81.9).abs() < 0.5);
    }

    #[test]
    fn test_roofline_analysis() {
        let hardware = test_profile();
        // OI = 1e9 / 1e8 = 10 FLOP/Byte; 1e9 FLOP in 0.01 s → 100 GFLOPS
        let workload = WorkloadMetrics::new("matmul", 1e9, 1e8, 0.01);

        let analysis = RooflineAnalysis::analyze(&hardware, &workload);

        assert!((analysis.operational_intensity - 10.0).abs() < 0.01);
        assert_eq!(analysis.bottleneck, BottleneckType::Balanced);
    }
}