/// Classification of which resource limits a workload on the roofline model.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BottleneckType {
    /// Limited by memory bandwidth (operational intensity below the ridge).
    MemoryBound,
    /// Limited by peak compute throughput (operational intensity above the ridge).
    ComputeBound,
    /// Operational intensity sits near the ridge point; neither side dominates.
    Balanced,
}

impl BottleneckType {
    /// Returns a human-readable optimization hint for this bottleneck class.
    pub fn recommendation(&self) -> &'static str {
        match *self {
            Self::MemoryBound => {
                "Improve memory access patterns: coalescing, prefetching, cache blocking"
            }
            Self::ComputeBound => {
                "Improve compute efficiency: SIMD, kernel fusion, algorithm optimization"
            }
            Self::Balanced => {
                "Both memory and compute matter equally; profile to find specific bottleneck"
            }
        }
    }

    /// Returns a short lowercase identifier suitable for labels and logs.
    pub fn name(&self) -> &'static str {
        match *self {
            Self::MemoryBound => "memory-bound",
            Self::ComputeBound => "compute-bound",
            Self::Balanced => "balanced",
        }
    }
}
57
58#[derive(Debug, Clone)]
60pub struct HardwareProfile {
61 pub name: String,
63 pub peak_gflops: f64,
65 pub peak_bandwidth_gbps: f64,
67 ridge_point: f64,
69}
70
71impl HardwareProfile {
72 pub fn new(name: &str, peak_gflops: f64, peak_bandwidth_gbps: f64) -> Self {
74 let ridge_point = if peak_bandwidth_gbps > 0.0 {
75 peak_gflops / peak_bandwidth_gbps
76 } else {
77 0.0
78 };
79 Self {
80 name: name.to_string(),
81 peak_gflops,
82 peak_bandwidth_gbps,
83 ridge_point,
84 }
85 }
86
87 pub fn ridge_point(&self) -> f64 {
89 self.ridge_point
90 }
91
92 pub fn theoretical_peak_at_oi(&self, operational_intensity: f64) -> f64 {
94 let memory_bound_peak = self.peak_bandwidth_gbps * operational_intensity;
96 self.peak_gflops.min(memory_bound_peak)
97 }
98
99 pub fn classify_bottleneck(&self, operational_intensity: f64) -> BottleneckType {
101 let ratio = operational_intensity / self.ridge_point;
102
103 if ratio < 0.9 {
104 BottleneckType::MemoryBound
105 } else if ratio > 1.1 {
106 BottleneckType::ComputeBound
107 } else {
108 BottleneckType::Balanced
109 }
110 }
111}
112
113pub mod profiles {
115 use super::HardwareProfile;
116
117 pub fn a100_sxm() -> HardwareProfile {
119 HardwareProfile::new("NVIDIA A100 SXM", 19_500.0, 2_039.0)
120 }
121
122 pub fn h100_sxm() -> HardwareProfile {
124 HardwareProfile::new("NVIDIA H100 SXM", 51_200.0, 3_350.0)
125 }
126
127 pub fn rtx_4090() -> HardwareProfile {
129 HardwareProfile::new("NVIDIA RTX 4090", 82_580.0, 1_008.0)
130 }
131
132 pub fn rtx_3090() -> HardwareProfile {
134 HardwareProfile::new("NVIDIA RTX 3090", 35_580.0, 936.0)
135 }
136
137 pub fn mi250x() -> HardwareProfile {
139 HardwareProfile::new("AMD Instinct MI250X", 47_872.0, 3_277.0)
140 }
141
142 pub fn avx512_per_core() -> HardwareProfile {
144 HardwareProfile::new("AVX-512 (per core)", 128.0, 50.0)
145 }
146
147 pub fn m2_ultra_gpu() -> HardwareProfile {
149 HardwareProfile::new("Apple M2 Ultra GPU", 27_200.0, 800.0)
150 }
151
152 pub fn all() -> Vec<HardwareProfile> {
154 vec![
155 a100_sxm(),
156 h100_sxm(),
157 rtx_4090(),
158 rtx_3090(),
159 mi250x(),
160 avx512_per_core(),
161 m2_ultra_gpu(),
162 ]
163 }
164}
165
/// Measured execution characteristics of a single workload.
#[derive(Debug, Clone)]
pub struct WorkloadMetrics {
    /// Workload label.
    pub name: String,
    /// Total floating-point operations performed.
    pub total_flops: f64,
    /// Total bytes moved to/from memory.
    pub total_bytes: f64,
    /// Achieved throughput in GFLOP/s; 0.0 when the runtime is non-positive.
    pub measured_gflops: f64,
    /// Wall-clock execution time in seconds.
    pub execution_time_s: f64,
}

impl WorkloadMetrics {
    /// Records a workload, deriving achieved GFLOP/s from the FLOP count and
    /// runtime. A non-positive runtime yields 0.0 GFLOP/s rather than a
    /// division by zero.
    pub fn new(name: &str, total_flops: f64, total_bytes: f64, execution_time_s: f64) -> Self {
        let measured_gflops = (execution_time_s > 0.0)
            .then(|| total_flops / execution_time_s / 1e9)
            .unwrap_or(0.0);
        Self {
            name: name.to_string(),
            total_flops,
            total_bytes,
            measured_gflops,
            execution_time_s,
        }
    }

    /// Operational intensity in FLOP/Byte; 0.0 when no bytes were moved.
    pub fn operational_intensity(&self) -> f64 {
        (self.total_bytes > 0.0)
            .then(|| self.total_flops / self.total_bytes)
            .unwrap_or(0.0)
    }
}
207
208#[derive(Debug, Clone)]
210pub struct RooflineAnalysis {
211 pub hardware: HardwareProfile,
213 pub workload: WorkloadMetrics,
215 pub operational_intensity: f64,
217 pub theoretical_peak: f64,
219 pub attained_efficiency: f64,
221 pub bottleneck: BottleneckType,
223}
224
225impl RooflineAnalysis {
226 pub fn analyze(hardware: &HardwareProfile, workload: &WorkloadMetrics) -> Self {
228 let operational_intensity = workload.operational_intensity();
229 let theoretical_peak = hardware.theoretical_peak_at_oi(operational_intensity);
230 let attained_efficiency = if theoretical_peak > 0.0 {
231 (workload.measured_gflops / theoretical_peak) * 100.0
232 } else {
233 0.0
234 };
235 let bottleneck = hardware.classify_bottleneck(operational_intensity);
236
237 Self {
238 hardware: hardware.clone(),
239 workload: workload.clone(),
240 operational_intensity,
241 theoretical_peak,
242 attained_efficiency,
243 bottleneck,
244 }
245 }
246
247 pub fn recommendation(&self) -> String {
249 let base = self.bottleneck.recommendation();
250 format!(
251 "{} (OI={:.2} FLOP/Byte, Ridge={:.2}, Efficiency={:.1}%)",
252 base,
253 self.operational_intensity,
254 self.hardware.ridge_point(),
255 self.attained_efficiency
256 )
257 }
258}
259
/// A single point on a log-log roofline chart, carrying both the raw and the
/// log2-scaled coordinates.
#[derive(Debug, Clone)]
pub struct RooflinePlotPoint {
    /// log2 of the operational intensity (x coordinate in log space).
    pub log_oi: f64,
    /// log2 of the performance (y coordinate in log space).
    pub log_perf: f64,
    /// Operational intensity in FLOP/Byte.
    pub oi: f64,
    /// Performance in GFLOP/s.
    pub perf: f64,
    /// Series or workload label for this point.
    pub label: String,
}

impl RooflinePlotPoint {
    /// Creates a labeled point, precomputing the log2 coordinates.
    /// Note: non-positive `oi`/`perf` yield -inf/NaN log coordinates.
    pub fn new(label: &str, oi: f64, perf: f64) -> Self {
        let (log_oi, log_perf) = (oi.log2(), perf.log2());
        Self {
            log_oi,
            log_perf,
            oi,
            perf,
            label: label.to_owned(),
        }
    }
}
287
288#[derive(Debug, Clone)]
290pub struct RooflinePlot {
291 pub hardware: HardwareProfile,
293 pub memory_bound_line: Vec<RooflinePlotPoint>,
295 pub compute_bound_line: Vec<RooflinePlotPoint>,
297 pub workload_points: Vec<RooflinePlotPoint>,
299 pub ridge_point: RooflinePlotPoint,
301}
302
303impl RooflinePlot {
304 pub fn generate(hardware: &HardwareProfile, workloads: &[WorkloadMetrics]) -> Self {
306 let ridge = hardware.ridge_point();
307
308 let memory_bound_line: Vec<RooflinePlotPoint> = (0..=20)
310 .map(|i| {
311 let oi = 0.1 * (ridge / 0.1).powf(i as f64 / 20.0);
312 let perf = hardware.peak_bandwidth_gbps * oi;
313 RooflinePlotPoint::new("memory-bound", oi, perf)
314 })
315 .collect();
316
317 let compute_bound_line: Vec<RooflinePlotPoint> = (0..=10)
319 .map(|i| {
320 let oi = ridge * (100.0 / ridge).powf(i as f64 / 10.0);
321 RooflinePlotPoint::new("compute-bound", oi, hardware.peak_gflops)
322 })
323 .collect();
324
325 let workload_points: Vec<RooflinePlotPoint> = workloads
327 .iter()
328 .map(|w| RooflinePlotPoint::new(&w.name, w.operational_intensity(), w.measured_gflops))
329 .collect();
330
331 let ridge_point = RooflinePlotPoint::new("ridge", ridge, hardware.peak_gflops);
333
334 Self {
335 hardware: hardware.clone(),
336 memory_bound_line,
337 compute_bound_line,
338 workload_points,
339 ridge_point,
340 }
341 }
342}
343
344#[derive(Debug)]
346pub struct BatchRooflineAnalysis {
347 pub hardware: HardwareProfile,
349 pub analyses: Vec<RooflineAnalysis>,
351}
352
353impl BatchRooflineAnalysis {
354 pub fn analyze(hardware: &HardwareProfile, workloads: &[WorkloadMetrics]) -> Self {
356 let analyses = workloads
357 .iter()
358 .map(|w| RooflineAnalysis::analyze(hardware, w))
359 .collect();
360 Self {
361 hardware: hardware.clone(),
362 analyses,
363 }
364 }
365
366 pub fn summary(&self) -> BatchSummary {
368 let memory_bound = self
369 .analyses
370 .iter()
371 .filter(|a| a.bottleneck == BottleneckType::MemoryBound)
372 .count();
373 let compute_bound = self
374 .analyses
375 .iter()
376 .filter(|a| a.bottleneck == BottleneckType::ComputeBound)
377 .count();
378 let balanced = self
379 .analyses
380 .iter()
381 .filter(|a| a.bottleneck == BottleneckType::Balanced)
382 .count();
383 let avg_efficiency = if self.analyses.is_empty() {
384 0.0
385 } else {
386 self.analyses
387 .iter()
388 .map(|a| a.attained_efficiency)
389 .sum::<f64>()
390 / self.analyses.len() as f64
391 };
392
393 BatchSummary {
394 total: self.analyses.len(),
395 memory_bound,
396 compute_bound,
397 balanced,
398 avg_efficiency,
399 }
400 }
401}
402
/// Aggregate statistics over a batch of roofline analyses.
#[derive(Debug, Clone)]
pub struct BatchSummary {
    // Number of workloads analyzed.
    pub total: usize,
    // Count of memory-bound workloads.
    pub memory_bound: usize,
    // Count of compute-bound workloads.
    pub compute_bound: usize,
    // Count of balanced workloads.
    pub balanced: usize,
    // Mean attained efficiency (%) across all analyses; 0.0 for an empty batch.
    pub avg_efficiency: f64,
}
417
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute-difference float comparison used throughout these tests.
    fn close(actual: f64, expected: f64, tol: f64) -> bool {
        (actual - expected).abs() < tol
    }

    #[test]
    fn test_ridge_point_calculation() {
        let profile = HardwareProfile::new("Test", 1000.0, 100.0);
        assert!(close(profile.ridge_point(), 10.0, 0.01));
    }

    #[test]
    fn test_bottleneck_classification_memory_bound() {
        let profile = HardwareProfile::new("Test", 1000.0, 100.0);
        assert_eq!(profile.classify_bottleneck(5.0), BottleneckType::MemoryBound);
    }

    #[test]
    fn test_bottleneck_classification_compute_bound() {
        let profile = HardwareProfile::new("Test", 1000.0, 100.0);
        assert_eq!(profile.classify_bottleneck(20.0), BottleneckType::ComputeBound);
    }

    #[test]
    fn test_bottleneck_classification_balanced() {
        let profile = HardwareProfile::new("Test", 1000.0, 100.0);
        assert_eq!(profile.classify_bottleneck(10.0), BottleneckType::Balanced);
    }

    #[test]
    fn test_operational_intensity() {
        let workload = WorkloadMetrics::new("test", 1000.0, 100.0, 1.0);
        assert!(close(workload.operational_intensity(), 10.0, 0.01));
    }

    #[test]
    fn test_a100_profile() {
        let a100 = profiles::a100_sxm();
        assert!(close(a100.peak_gflops, 19500.0, 1.0));
        assert!(close(a100.peak_bandwidth_gbps, 2039.0, 1.0));
        assert!(close(a100.ridge_point(), 9.56, 0.1));
    }

    #[test]
    fn test_h100_profile() {
        assert!(close(profiles::h100_sxm().ridge_point(), 15.28, 0.1));
    }

    #[test]
    fn test_rtx_4090_profile() {
        assert!(close(profiles::rtx_4090().ridge_point(), 81.9, 0.5));
    }

    #[test]
    fn test_roofline_analysis() {
        // 1e9 FLOPs over 1e8 bytes -> OI = 10, exactly at the ridge point.
        let hardware = HardwareProfile::new("Test", 1000.0, 100.0);
        let workload = WorkloadMetrics::new("matmul", 1e9, 1e8, 0.01);
        let analysis = RooflineAnalysis::analyze(&hardware, &workload);
        assert!(close(analysis.operational_intensity, 10.0, 0.01));
        assert_eq!(analysis.bottleneck, BottleneckType::Balanced);
    }
}