1use anyhow::Result;
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
9#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
11pub enum Precision {
12 Fp32,
13 Fp16,
14 Tf32,
15 Int8,
16 Bf16,
17}
18
19impl std::fmt::Display for Precision {
20 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
21 match self {
22 Precision::Fp32 => write!(f, "FP32"),
23 Precision::Fp16 => write!(f, "FP16 Tensor"),
24 Precision::Tf32 => write!(f, "TF32 Tensor"),
25 Precision::Int8 => write!(f, "INT8 Tensor"),
26 Precision::Bf16 => write!(f, "BF16"),
27 }
28 }
29}
30
31#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
33pub enum MemoryLevel {
34 L1Cache,
35 L2Cache,
36 Dram,
37 Pcie,
38}
39
40impl std::fmt::Display for MemoryLevel {
41 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
42 match self {
43 MemoryLevel::L1Cache => write!(f, "L1 Cache"),
44 MemoryLevel::L2Cache => write!(f, "L2 Cache"),
45 MemoryLevel::Dram => write!(f, "DRAM"),
46 MemoryLevel::Pcie => write!(f, "PCIe"),
47 }
48 }
49}
50
51#[derive(Debug, Clone, Serialize, Deserialize)]
53pub enum Bound {
54 Memory { bandwidth_utilization: f64 },
56 Compute { compute_utilization: f64 },
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize)]
63pub struct RooflineModel {
64 pub target: String,
66 pub peak_compute: HashMap<Precision, f64>,
68 pub peak_bandwidth: HashMap<MemoryLevel, f64>,
70}
71
72impl RooflineModel {
73 pub fn ridge_point(&self, precision: Precision, mem_level: MemoryLevel) -> Option<f64> {
78 let compute = self.peak_compute.get(&precision)?;
79 let bandwidth = self.peak_bandwidth.get(&mem_level)?;
80 if *bandwidth <= 0.0 {
81 return None;
82 }
83 Some(compute / bandwidth)
84 }
85
86 pub fn theoretical_peak(
89 &self,
90 arithmetic_intensity: f64,
91 precision: Precision,
92 mem_level: MemoryLevel,
93 ) -> Option<f64> {
94 let compute = self.peak_compute.get(&precision)?;
95 let bandwidth = self.peak_bandwidth.get(&mem_level)?;
96 Some(compute.min(arithmetic_intensity * bandwidth))
97 }
98
99 pub fn classify(
101 &self,
102 arithmetic_intensity: f64,
103 achieved_throughput: f64,
104 precision: Precision,
105 mem_level: MemoryLevel,
106 ) -> Option<KernelRooflinePoint> {
107 let ridge = self.ridge_point(precision, mem_level)?;
108 let peak = self.theoretical_peak(arithmetic_intensity, precision, mem_level)?;
109 let peak_compute = *self.peak_compute.get(&precision)?;
110
111 let bound = if arithmetic_intensity < ridge {
112 Bound::Memory {
113 bandwidth_utilization: achieved_throughput / peak * 100.0,
114 }
115 } else {
116 Bound::Compute {
117 compute_utilization: achieved_throughput / peak_compute * 100.0,
118 }
119 };
120
121 let efficiency = if peak > 0.0 {
122 achieved_throughput / peak * 100.0
123 } else {
124 0.0
125 };
126
127 let distance_to_ridge = if arithmetic_intensity > 0.0 {
128 ridge / arithmetic_intensity
129 } else {
130 f64::INFINITY
131 };
132
133 Some(KernelRooflinePoint {
134 arithmetic_intensity,
135 achieved_throughput,
136 peak_throughput: peak,
137 efficiency,
138 bound,
139 distance_to_ridge,
140 })
141 }
142
143 pub fn rtx_4090() -> Self {
145 let mut peak_compute = HashMap::new();
146 peak_compute.insert(Precision::Fp32, 82.6e12); peak_compute.insert(Precision::Fp16, 330.0e12); peak_compute.insert(Precision::Tf32, 165.0e12); peak_compute.insert(Precision::Int8, 660.0e12); let mut peak_bandwidth = HashMap::new();
152 peak_bandwidth.insert(MemoryLevel::L1Cache, 19.0e12); peak_bandwidth.insert(MemoryLevel::L2Cache, 5.3e12); peak_bandwidth.insert(MemoryLevel::Dram, 1008.0e9); peak_bandwidth.insert(MemoryLevel::Pcie, 32.0e9); RooflineModel {
158 target: "NVIDIA GeForce RTX 4090 (SM 8.9)".to_string(),
159 peak_compute,
160 peak_bandwidth,
161 }
162 }
163
164 pub fn cpu_avx2(freq_ghz: f64, cores: usize, mem_bandwidth_gbps: f64) -> Self {
167 let fp32_peak = 2.0 * 8.0 * 2.0 * freq_ghz * 1e9 * cores as f64;
169
170 let mut peak_compute = HashMap::new();
171 peak_compute.insert(Precision::Fp32, fp32_peak);
172
173 let mut peak_bandwidth = HashMap::new();
174 peak_bandwidth.insert(MemoryLevel::Dram, mem_bandwidth_gbps * 1e9);
175
176 RooflineModel {
177 target: format!("CPU AVX2+FMA ({cores} cores @ {freq_ghz} GHz)"),
178 peak_compute,
179 peak_bandwidth,
180 }
181 }
182
183 pub fn cpu_avx512(freq_ghz: f64, cores: usize, mem_bandwidth_gbps: f64) -> Self {
186 let fp32_peak = 2.0 * 16.0 * 2.0 * freq_ghz * 1e9 * cores as f64;
187
188 let mut peak_compute = HashMap::new();
189 peak_compute.insert(Precision::Fp32, fp32_peak);
190
191 let mut peak_bandwidth = HashMap::new();
192 peak_bandwidth.insert(MemoryLevel::Dram, mem_bandwidth_gbps * 1e9);
193
194 RooflineModel {
195 target: format!("CPU AVX-512+FMA ({cores} cores @ {freq_ghz} GHz)"),
196 peak_compute,
197 peak_bandwidth,
198 }
199 }
200
201 pub fn cpu_neon(freq_ghz: f64, cores: usize, mem_bandwidth_gbps: f64) -> Self {
204 let fp32_peak = 2.0 * 4.0 * 2.0 * freq_ghz * 1e9 * cores as f64;
205
206 let mut peak_compute = HashMap::new();
207 peak_compute.insert(Precision::Fp32, fp32_peak);
208
209 let mut peak_bandwidth = HashMap::new();
210 peak_bandwidth.insert(MemoryLevel::Dram, mem_bandwidth_gbps * 1e9);
211
212 RooflineModel {
213 target: format!("CPU NEON ({cores} cores @ {freq_ghz} GHz)"),
214 peak_compute,
215 peak_bandwidth,
216 }
217 }
218}
219
220#[derive(Debug, Clone, Serialize, Deserialize)]
222pub struct KernelRooflinePoint {
223 pub arithmetic_intensity: f64,
225 pub achieved_throughput: f64,
227 pub peak_throughput: f64,
229 pub efficiency: f64,
231 pub bound: Bound,
233 pub distance_to_ridge: f64,
235}
236
237#[derive(Debug, Clone, Serialize, Deserialize)]
239pub struct EmpiricalResult {
240 pub measured_bandwidth_bps: f64,
242 pub measured_peak_flops: f64,
244 pub measured_ridge_point: f64,
246 pub bandwidth_efficiency: f64,
248 pub compute_efficiency: f64,
250}
251
252fn measure_bandwidth() -> f64 {
255 const N: usize = 16 * 1024 * 1024; const ITERS: usize = 10;
257
258 let a: Vec<f32> = vec![1.0f32; N];
259 let mut b: Vec<f32> = vec![0.0f32; N];
260
261 b.copy_from_slice(&a);
263
264 let start = std::time::Instant::now();
265 for _ in 0..ITERS {
266 b.copy_from_slice(&a);
267 std::hint::black_box(&b);
269 }
270 let elapsed = start.elapsed().as_secs_f64();
271
272 let bytes = 2.0 * N as f64 * 4.0 * ITERS as f64;
274 bytes / elapsed
275}
276
277fn measure_bandwidth_triad() -> f64 {
280 const N: usize = 16 * 1024 * 1024; const ITERS: usize = 10;
282
283 let b: Vec<f32> = vec![1.0f32; N];
284 let c: Vec<f32> = vec![2.0f32; N];
285 let mut a: Vec<f32> = vec![0.0f32; N];
286 let s = 3.0f32;
287
288 for i in 0..N {
290 a[i] = b[i] + s * c[i];
291 }
292
293 let start = std::time::Instant::now();
294 for _ in 0..ITERS {
295 for i in 0..N {
296 a[i] = b[i] + s * c[i];
297 }
298 std::hint::black_box(&a);
299 }
300 let elapsed = start.elapsed().as_secs_f64();
301
302 let bytes = 3.0 * N as f64 * 4.0 * ITERS as f64;
304 bytes / elapsed
305}
306
307fn measure_peak_flops_single_core() -> f64 {
310 #[cfg(target_arch = "x86_64")]
311 {
312 if std::arch::is_x86_feature_detected!("avx512f") {
313 return unsafe { measure_peak_flops_avx512() };
315 }
316 if std::arch::is_x86_feature_detected!("avx2") && std::arch::is_x86_feature_detected!("fma")
317 {
318 return unsafe { measure_peak_flops_avx2() };
320 }
321 }
322 measure_peak_flops_scalar()
323}
324
325fn measure_peak_flops_scalar() -> f64 {
327 const ITERS: u64 = 500_000_000;
328 let mut a0 = 1.0f32;
329 let mut a1 = 1.1f32;
330 let mut a2 = 1.2f32;
331 let mut a3 = 1.3f32;
332 let m = 1.0000001f32;
333 let add = 0.0000001f32;
334
335 let start = std::time::Instant::now();
336 for _ in 0..ITERS {
337 a0 = a0.mul_add(m, add);
338 a1 = a1.mul_add(m, add);
339 a2 = a2.mul_add(m, add);
340 a3 = a3.mul_add(m, add);
341 }
342 let elapsed = start.elapsed().as_secs_f64();
343 std::hint::black_box(a0 + a1 + a2 + a3);
344 ITERS as f64 * 8.0 / elapsed
346}
347
348#[cfg(target_arch = "x86_64")]
350#[target_feature(enable = "avx2,fma")]
351unsafe fn measure_peak_flops_avx2() -> f64 {
352 use std::arch::x86_64::*;
353
354 const ITERS: u64 = 100_000_000;
355
356 let mut v0 = _mm256_set1_ps(1.0);
358 let mut v1 = _mm256_set1_ps(1.1);
359 let mut v2 = _mm256_set1_ps(1.2);
360 let mut v3 = _mm256_set1_ps(1.3);
361 let mut v4 = _mm256_set1_ps(1.4);
362 let mut v5 = _mm256_set1_ps(1.5);
363 let mut v6 = _mm256_set1_ps(1.6);
364 let mut v7 = _mm256_set1_ps(1.7);
365 let mut v8 = _mm256_set1_ps(1.8);
366 let mut v9 = _mm256_set1_ps(1.9);
367 let mul = _mm256_set1_ps(1.0000001);
368 let add = _mm256_set1_ps(0.0000001);
369
370 let start = std::time::Instant::now();
371 for _ in 0..ITERS {
372 v0 = _mm256_fmadd_ps(v0, mul, add);
374 v1 = _mm256_fmadd_ps(v1, mul, add);
375 v2 = _mm256_fmadd_ps(v2, mul, add);
376 v3 = _mm256_fmadd_ps(v3, mul, add);
377 v4 = _mm256_fmadd_ps(v4, mul, add);
378 v5 = _mm256_fmadd_ps(v5, mul, add);
379 v6 = _mm256_fmadd_ps(v6, mul, add);
380 v7 = _mm256_fmadd_ps(v7, mul, add);
381 v8 = _mm256_fmadd_ps(v8, mul, add);
382 v9 = _mm256_fmadd_ps(v9, mul, add);
383 }
384 let elapsed = start.elapsed().as_secs_f64();
385 let sum = _mm256_add_ps(v0, v1);
387 let sum = _mm256_add_ps(sum, v2);
388 let sum = _mm256_add_ps(sum, v3);
389 let sum = _mm256_add_ps(sum, v4);
390 let sum = _mm256_add_ps(sum, v5);
391 let sum = _mm256_add_ps(sum, v6);
392 let sum = _mm256_add_ps(sum, v7);
393 let sum = _mm256_add_ps(sum, v8);
394 let sum = _mm256_add_ps(sum, v9);
395 std::hint::black_box(sum);
396
397 ITERS as f64 * 160.0 / elapsed
399}
400
401#[cfg(target_arch = "x86_64")]
403#[target_feature(enable = "avx512f")]
404unsafe fn measure_peak_flops_avx512() -> f64 {
405 use std::arch::x86_64::*;
406
407 const ITERS: u64 = 100_000_000;
408
409 let mut v0 = _mm512_set1_ps(1.0);
411 let mut v1 = _mm512_set1_ps(1.1);
412 let mut v2 = _mm512_set1_ps(1.2);
413 let mut v3 = _mm512_set1_ps(1.3);
414 let mut v4 = _mm512_set1_ps(1.4);
415 let mut v5 = _mm512_set1_ps(1.5);
416 let mut v6 = _mm512_set1_ps(1.6);
417 let mut v7 = _mm512_set1_ps(1.7);
418 let mut v8 = _mm512_set1_ps(1.8);
419 let mut v9 = _mm512_set1_ps(1.9);
420 let mul = _mm512_set1_ps(1.0000001);
421 let add = _mm512_set1_ps(0.0000001);
422
423 let start = std::time::Instant::now();
424 for _ in 0..ITERS {
425 v0 = _mm512_fmadd_ps(v0, mul, add);
427 v1 = _mm512_fmadd_ps(v1, mul, add);
428 v2 = _mm512_fmadd_ps(v2, mul, add);
429 v3 = _mm512_fmadd_ps(v3, mul, add);
430 v4 = _mm512_fmadd_ps(v4, mul, add);
431 v5 = _mm512_fmadd_ps(v5, mul, add);
432 v6 = _mm512_fmadd_ps(v6, mul, add);
433 v7 = _mm512_fmadd_ps(v7, mul, add);
434 v8 = _mm512_fmadd_ps(v8, mul, add);
435 v9 = _mm512_fmadd_ps(v9, mul, add);
436 }
437 let elapsed = start.elapsed().as_secs_f64();
438 let sum = _mm512_add_ps(v0, v1);
439 let sum = _mm512_add_ps(sum, v2);
440 let sum = _mm512_add_ps(sum, v3);
441 let sum = _mm512_add_ps(sum, v4);
442 let sum = _mm512_add_ps(sum, v5);
443 let sum = _mm512_add_ps(sum, v6);
444 let sum = _mm512_add_ps(sum, v7);
445 let sum = _mm512_add_ps(sum, v8);
446 let sum = _mm512_add_ps(sum, v9);
447 std::hint::black_box(sum);
448
449 ITERS as f64 * 320.0 / elapsed
451}
452
453pub fn measure_empirical(theoretical: &RooflineModel) -> EmpiricalResult {
455 let bw_copy = measure_bandwidth();
456 let bw_triad = measure_bandwidth_triad();
457 let measured_bw = bw_copy.max(bw_triad);
459 let measured_flops = measure_peak_flops_single_core();
460
461 let theoretical_bw = theoretical
462 .peak_bandwidth
463 .get(&MemoryLevel::Dram)
464 .copied()
465 .unwrap_or(1.0);
466 let theoretical_flops = theoretical
467 .peak_compute
468 .get(&Precision::Fp32)
469 .copied()
470 .unwrap_or(1.0);
471
472 let cores = num_cpus::get_physical() as f64;
475 let single_core_theoretical = theoretical_flops / cores;
476
477 EmpiricalResult {
478 measured_bandwidth_bps: measured_bw,
479 measured_peak_flops: measured_flops,
480 measured_ridge_point: measured_flops / measured_bw,
481 bandwidth_efficiency: measured_bw / theoretical_bw * 100.0,
482 compute_efficiency: measured_flops / single_core_theoretical * 100.0,
483 }
484}
485
486fn print_roofline(model: &RooflineModel) {
488 println!("\n=== cgp Roofline: {} ===\n", model.target);
489
490 println!(" Peak Compute:");
491 let mut precisions: Vec<_> = model.peak_compute.iter().collect();
492 precisions.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal));
493 for (prec, peak) in &precisions {
494 println!(" {prec:15}: {:8.1} TFLOP/s", *peak / 1e12);
495 }
496
497 println!("\n Peak Bandwidth:");
498 let mut levels: Vec<_> = model.peak_bandwidth.iter().collect();
499 levels.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal));
500 for (level, bw) in &levels {
501 if **bw >= 1e12 {
502 println!(" {level:15}: {:8.1} TB/s", *bw / 1e12);
503 } else {
504 println!(" {level:15}: {:8.1} GB/s", *bw / 1e9);
505 }
506 }
507
508 println!("\n Ridge Points (vs DRAM):");
509 for (prec, _) in &precisions {
510 if let Some(ridge) = model.ridge_point(**prec, MemoryLevel::Dram) {
511 println!(" {prec:15}: {:8.1} FLOP/byte", ridge);
512 }
513 }
514}
515
516pub fn run_roofline(
518 target: &str,
519 _kernels: Option<&str>,
520 export: Option<&str>,
521 empirical: bool,
522 json: bool,
523) -> Result<()> {
524 let model = match target {
525 "cuda" => RooflineModel::rtx_4090(),
526 "avx2" => {
527 let cores = num_cpus::get_physical();
528 RooflineModel::cpu_avx2(3.5, cores, 204.8)
529 }
530 "avx512" => {
531 let cores = num_cpus::get_physical();
532 RooflineModel::cpu_avx512(3.5, cores, 204.8)
533 }
534 "neon" => {
535 let cores = num_cpus::get_physical();
536 RooflineModel::cpu_neon(3.0, cores, 51.2)
537 }
538 "wgpu" => RooflineModel::rtx_4090(),
539 other => anyhow::bail!(
540 "Unknown roofline target: {other}. Supported: cuda, avx2, avx512, neon, wgpu"
541 ),
542 };
543
544 if json && empirical && !target.starts_with("cuda") && target != "wgpu" {
546 let emp = measure_empirical(&model);
547 #[derive(Serialize)]
548 struct EmpiricalJson<'a> {
549 theoretical: &'a RooflineModel,
550 empirical: &'a EmpiricalResult,
551 }
552 let combined = EmpiricalJson {
553 theoretical: &model,
554 empirical: &emp,
555 };
556 println!("{}", serde_json::to_string_pretty(&combined)?);
557 return Ok(());
558 }
559
560 if json {
561 let json_str = serde_json::to_string_pretty(&model)?;
562 println!("{json_str}");
563 return Ok(());
564 }
565
566 print_roofline(&model);
567
568 if empirical && !target.starts_with("cuda") && target != "wgpu" {
569 println!("\n --- Empirical Measurement (single-core) ---\n");
570 let emp = measure_empirical(&model);
571
572 println!(
573 " DRAM Bandwidth: {:8.1} GB/s ({:.0}% of theoretical)",
574 emp.measured_bandwidth_bps / 1e9,
575 emp.bandwidth_efficiency
576 );
577 println!(
578 " Peak FP32 FLOPS: {:8.1} GFLOP/s (single-core, {:.0}% of theoretical)",
579 emp.measured_peak_flops / 1e9,
580 emp.compute_efficiency
581 );
582 println!(
583 " Empirical Ridge: {:8.1} FLOP/byte",
584 emp.measured_ridge_point
585 );
586 } else if empirical {
587 println!("\n (Empirical measurement for GPU targets requires CUDA — use cgp roofline --target avx2 --empirical for CPU)");
588 }
589
590 if let Some(path) = export {
591 let json_str = serde_json::to_string_pretty(&model)?;
592 std::fs::write(path, json_str)?;
593 println!("\n Exported to: {path}");
594 }
595
596 println!();
597 Ok(())
598}
599
600#[cfg(test)]
601mod tests {
602 use super::*;
603
604 #[test]
608 fn test_ridge_point_rtx4090_fp16() {
609 let model = RooflineModel::rtx_4090();
610 let ridge = model
611 .ridge_point(Precision::Fp16, MemoryLevel::Dram)
612 .unwrap();
613 let expected = 330_000.0 / 1008.0; assert!(
615 (ridge - expected).abs() < 0.5,
616 "Ridge point {ridge:.1} not within 0.5 of expected {expected:.1}"
617 );
618 }
619
620 #[test]
622 fn test_ridge_points_all_precisions() {
623 let model = RooflineModel::rtx_4090();
624 let dram_bw = 1008.0e9;
625
626 let cases = [
627 (Precision::Fp32, 82.6e12),
628 (Precision::Fp16, 330.0e12),
629 (Precision::Tf32, 165.0e12),
630 (Precision::Int8, 660.0e12),
631 ];
632
633 for (prec, peak) in cases {
634 let ridge = model.ridge_point(prec, MemoryLevel::Dram).unwrap();
635 let expected = peak / dram_bw;
636 assert!(
637 (ridge - expected).abs() / expected < 0.001,
638 "{prec}: ridge {ridge:.2} != expected {expected:.2}"
639 );
640 }
641 }
642
643 #[test]
645 fn test_memory_bound_classification() {
646 let model = RooflineModel::rtx_4090();
647 let point = model
649 .classify(8.0, 5e12, Precision::Fp16, MemoryLevel::Dram)
650 .unwrap();
651 assert!(matches!(point.bound, Bound::Memory { .. }));
652 assert!(point.distance_to_ridge > 1.0);
653 }
654
655 #[test]
657 fn test_compute_bound_classification() {
658 let model = RooflineModel::rtx_4090();
659 let point = model
661 .classify(500.0, 300e12, Precision::Fp16, MemoryLevel::Dram)
662 .unwrap();
663 assert!(matches!(point.bound, Bound::Compute { .. }));
664 assert!(point.distance_to_ridge < 1.0);
665 }
666
667 #[test]
669 fn test_theoretical_peak() {
670 let model = RooflineModel::rtx_4090();
671 let low_ai = model
673 .theoretical_peak(8.0, Precision::Fp16, MemoryLevel::Dram)
674 .unwrap();
675 assert!((low_ai - 8.0 * 1008.0e9).abs() / low_ai < 0.001);
676
677 let high_ai = model
679 .theoretical_peak(500.0, Precision::Fp16, MemoryLevel::Dram)
680 .unwrap();
681 assert!((high_ai - 330.0e12).abs() / high_ai < 0.001);
682 }
683
684 #[test]
686 fn test_cpu_avx2_peak() {
687 let model = RooflineModel::cpu_avx2(3.5, 8, 51.2);
688 let fp32_peak = *model.peak_compute.get(&Precision::Fp32).unwrap();
689 let expected = 2.0 * 8.0 * 2.0 * 3.5e9 * 8.0; assert!(
691 (fp32_peak - expected).abs() / expected < 0.001,
692 "FP32 peak {:.1} GFLOP/s != expected {:.1} GFLOP/s",
693 fp32_peak / 1e9,
694 expected / 1e9
695 );
696 }
697
698 #[test]
700 fn test_rtx4090_bandwidth_spec() {
701 let model = RooflineModel::rtx_4090();
702 let dram = *model.peak_bandwidth.get(&MemoryLevel::Dram).unwrap();
703 assert!(
704 (dram - 1008.0e9).abs() < 1e6,
705 "DRAM bandwidth {:.1} GB/s != 1008.0 GB/s",
706 dram / 1e9
707 );
708 }
709
710 #[test]
712 fn test_empirical_bandwidth_positive() {
713 let bw = measure_bandwidth();
714 assert!(bw > 0.0, "Measured bandwidth must be positive, got {bw}");
715 assert!(
717 bw < 500.0e9,
718 "Single-core bandwidth {:.1} GB/s suspiciously high",
719 bw / 1e9
720 );
721 }
722
723 #[test]
725 fn test_empirical_flops_positive() {
726 let flops = measure_peak_flops_single_core();
727 assert!(flops > 0.0, "Measured FLOPS must be positive, got {flops}");
728 assert!(
730 flops > 1.0e9,
731 "Single-core FLOPS {:.1} GFLOP/s suspiciously low",
732 flops / 1e9
733 );
734 }
735
736 #[test]
738 fn test_empirical_ridge_plausible() {
739 let model = RooflineModel::cpu_avx512(3.5, 24, 204.8);
740 let emp = measure_empirical(&model);
741 assert!(
743 emp.measured_ridge_point > 0.0 && emp.measured_ridge_point < 1000.0,
744 "Empirical ridge {:.1} FLOP/byte implausible",
745 emp.measured_ridge_point
746 );
747 }
748
749 #[test]
751 fn test_triad_bandwidth_positive() {
752 let bw = measure_bandwidth_triad();
753 assert!(bw > 0.0, "Triad bandwidth must be positive, got {bw}");
754 }
755}