Skip to main content

cgp/analysis/
compare.rs

1//! `cgp profile compare` — Cross-backend comparison.
2//! Spec section 2.2: run the same workload across multiple backends
3//! and produce a comparison table with TFLOP/s, bandwidth, and speedup ratios.
4
5use crate::analysis::roofline::{Precision, RooflineModel};
6use anyhow::Result;
7use serde::Serialize;
8
9/// Supported backends for comparison.
10#[derive(Debug, Clone, Serialize)]
11pub struct BackendResult {
12    pub name: String,
13    pub wall_time_us: f64,
14    pub tflops: f64,
15    pub bandwidth_gbps: f64,
16    pub available: bool,
17    /// Whether data comes from actual measurement vs estimation
18    #[serde(skip_serializing_if = "std::ops::Not::not")]
19    pub measured: bool,
20}
21
22/// Compute TFLOP/s for GEMM: 2*M*N*K / time.
23fn gemm_tflops(size: u32, time_us: f64) -> f64 {
24    if time_us <= 0.0 {
25        return 0.0;
26    }
27    let flops = 2.0 * (size as f64).powi(3);
28    flops / (time_us * 1e-6) / 1e12
29}
30
31/// Try to get actual GEMM timing from benchmark_matrix_suite binary.
32/// Returns (time_us, gflops) if the binary exists and the size is benchmarked.
33fn get_actual_gemm_timing(size: u32) -> Option<(f64, f64)> {
34    let stdout = run_benchmark_suite()?;
35    let pattern = format!("Matrix Multiplication ({size}x{size}x{size})");
36    stdout
37        .lines()
38        .find(|line| line.contains(&pattern))
39        .and_then(parse_benchmark_line)
40}
41
42/// Locate and execute the benchmark_matrix_suite binary; returns stdout on success.
43fn run_benchmark_suite() -> Option<String> {
44    let candidates = [
45        "/mnt/nvme-raid0/targets/trueno/release/examples/benchmark_matrix_suite",
46        "./target/release/examples/benchmark_matrix_suite",
47    ];
48    let binary_path = candidates
49        .iter()
50        .find(|p| std::path::Path::new(p).exists())?;
51    let output = std::process::Command::new(*binary_path)
52        .stdout(std::process::Stdio::piped())
53        .stderr(std::process::Stdio::piped())
54        .output()
55        .ok()?;
56    if !output.status.success() {
57        return None;
58    }
59    Some(String::from_utf8_lossy(&output.stdout).into_owned())
60}
61
62/// Parse a single line of the form
63/// `  Matrix Multiplication (NxNxN)...     X.XX ms  (Y.YY GFLOPS)`.
64fn parse_benchmark_line(line: &str) -> Option<(f64, f64)> {
65    let after_dots = line.split("...").nth(1)?;
66    let time_ms = after_dots.split("ms").next()?.trim().parse::<f64>().ok()?;
67    let gflops = after_dots
68        .split('(')
69        .nth(1)?
70        .split(" GFLOPS")
71        .next()?
72        .trim()
73        .parse::<f64>()
74        .ok()?;
75    Some((time_ms * 1000.0, gflops))
76}
77
78/// Estimate scalar GEMM time from measured data on Threadripper 7960X.
79/// Reference GEMM: 256→11.7ms, cubic scaling.
80fn estimate_scalar_time_us(size: u32) -> f64 {
81    // Calibrated: 11.7ms at 256x256 on Threadripper 7960X
82    let ratio = (size as f64 / 256.0).powi(3);
83    11_700.0 * ratio
84}
85
86/// Estimate AVX2 BLIS single-thread GEMM from measured data.
87/// Calibrated: 256→0.57ms, 512→3.75ms, 1024→30.1ms (71 GFLOPS).
88fn estimate_avx2_time_us(size: u32) -> f64 {
89    // BLIS GEMM single-thread: ~72 GFLOPS sustained
90    let flops = 2.0 * (size as f64).powi(3);
91    let gflops = 72.0; // measured on Threadripper 7960X
92    flops / (gflops * 1e9) * 1e6
93}
94
95/// Estimate AVX-512 BLIS GEMM (slightly faster than AVX2, but clock throttle).
96/// ~80 GFLOPS measured single-thread (AVX-512 downclocking limits gains).
97fn estimate_avx512_time_us(size: u32) -> f64 {
98    let flops = 2.0 * (size as f64).powi(3);
99    let gflops = 80.0; // AVX-512 with downclocking ~10% faster than AVX2
100    flops / (gflops * 1e9) * 1e6
101}
102
103/// Estimate CUDA CTA WMMA GEMM from measured data on RTX 4090.
104/// Calibrated: 23.2us at 512x512 = 11.6 TFLOP/s.
105fn estimate_cuda_time_us(size: u32) -> f64 {
106    let ratio = (size as f64 / 512.0).powi(3);
107    23.2 * ratio
108}
109
110/// Estimate cuBLAS GEMM from measured RTX 4090 data.
111/// cuBLAS achieves ~35 TFLOP/s FP16 on RTX 4090 (~3x pure PTX).
112fn estimate_cublas_time_us(size: u32) -> f64 {
113    estimate_cuda_time_us(size) / 3.0
114}
115
116/// Measure actual cuBLAS FP16 GEMM throughput via trueno-gpu driver.
117/// Returns (time_us, tflops) or None if CUDA unavailable.
118#[cfg(feature = "cuda")]
119fn measure_cublas_gemm(size: u32) -> Option<(f64, f64)> {
120    use trueno_gpu::driver::{CublasHandle, CudaContext, CudaStream, GemmOp, GpuBuffer};
121
122    let ctx = CudaContext::new(0).ok()?;
123    let stream = CudaStream::new(&ctx).ok()?;
124    let handle = CublasHandle::new(&ctx).ok()?;
125    handle.set_stream(&stream).ok()?;
126
127    let n = size as usize;
128    let a_data = vec![0x3C00u16; n * n]; // 1.0 in FP16
129    let b_data = vec![0x3C00u16; n * n];
130    let c_data = vec![0u16; n * n];
131
132    let a_buf = GpuBuffer::from_host(&ctx, &a_data).ok()?;
133    let b_buf = GpuBuffer::from_host(&ctx, &b_data).ok()?;
134    let c_buf = GpuBuffer::from_host(&ctx, &c_data).ok()?;
135
136    // Warmup
137    for _ in 0..5 {
138        let _ = handle.gemm_f16(
139            GemmOp::NoTrans,
140            GemmOp::NoTrans,
141            n as i32,
142            n as i32,
143            n as i32,
144            1.0,
145            a_buf.as_ptr(),
146            n as i32,
147            b_buf.as_ptr(),
148            n as i32,
149            0.0,
150            c_buf.as_ptr(),
151            n as i32,
152        );
153    }
154    stream.synchronize().ok()?;
155
156    let iters: u32 = if n <= 512 {
157        200
158    } else if n <= 1024 {
159        100
160    } else {
161        30
162    };
163    let start = std::time::Instant::now();
164    for _ in 0..iters {
165        let _ = handle.gemm_f16(
166            GemmOp::NoTrans,
167            GemmOp::NoTrans,
168            n as i32,
169            n as i32,
170            n as i32,
171            1.0,
172            a_buf.as_ptr(),
173            n as i32,
174            b_buf.as_ptr(),
175            n as i32,
176            0.0,
177            c_buf.as_ptr(),
178            n as i32,
179        );
180    }
181    stream.synchronize().ok()?;
182    let elapsed = start.elapsed();
183
184    let per_call_us = elapsed.as_micros() as f64 / iters as f64;
185    let flops = 2.0 * (n as f64).powi(3);
186    let tflops = flops / (per_call_us * 1e6);
187
188    Some((per_call_us, tflops))
189}
190
191/// Measure our best PTX GEMM kernel (64×128 pipeline) on GPU.
192/// Returns (time_us, tflops) or None if CUDA unavailable.
193#[cfg(feature = "cuda")]
194fn measure_ptx_gemm(size: u32) -> Option<(f64, f64)> {
195    use std::ffi::c_void;
196    use trueno_gpu::driver::{CudaContext, CudaModule, CudaStream, GpuBuffer, LaunchConfig};
197    use trueno_gpu::kernels::build_cta64x128_mma_pipeline_fp16;
198    use trueno_gpu::ptx::PtxModule;
199
200    let ctx = CudaContext::new(0).ok()?;
201    let stream = CudaStream::new(&ctx).ok()?;
202
203    let n = size as usize;
204    let a16 = vec![0x3C00u16; n * n];
205    let b16 = vec![0x3C00u16; n * n];
206    let c32 = vec![0.0f32; n * n];
207
208    let a_buf = GpuBuffer::from_host(&ctx, &a16).ok()?;
209    let b_buf = GpuBuffer::from_host(&ctx, &b16).ok()?;
210    let c_buf = GpuBuffer::from_host(&ctx, &c32).ok()?;
211
212    let kernel = build_cta64x128_mma_pipeline_fp16(n as u32, n as u32, n as u32);
213    let ptx = PtxModule::new().target("sm_80").add_kernel(kernel).emit();
214    let mut module = CudaModule::from_ptx(&ctx, &ptx).ok()?;
215
216    let cfg = LaunchConfig {
217        grid: (((n + 127) / 128) as u32, ((n + 63) / 64) as u32, 1),
218        block: (512, 1, 1),
219        shared_mem: 18432,
220    };
221
222    let mut a_ptr = a_buf.as_ptr();
223    let mut b_ptr = b_buf.as_ptr();
224    let mut c_ptr = c_buf.as_ptr();
225    let mut m_v = n as u32;
226    let mut n_v = n as u32;
227    let mut k_v = n as u32;
228    let mut args: Vec<*mut c_void> = vec![
229        &mut a_ptr as *mut _ as *mut c_void,
230        &mut b_ptr as *mut _ as *mut c_void,
231        &mut c_ptr as *mut _ as *mut c_void,
232        &mut m_v as *mut _ as *mut c_void,
233        &mut n_v as *mut _ as *mut c_void,
234        &mut k_v as *mut _ as *mut c_void,
235    ];
236
237    // Warmup
238    for _ in 0..5 {
239        unsafe {
240            stream
241                .launch_kernel(
242                    &mut module,
243                    "gemm_cta64x128_mma_pipeline_fp16",
244                    &cfg,
245                    &mut args,
246                )
247                .ok()?;
248        }
249    }
250    stream.synchronize().ok()?;
251
252    let iters: u32 = if n <= 512 {
253        100
254    } else if n <= 1024 {
255        50
256    } else {
257        20
258    };
259    let start = std::time::Instant::now();
260    for _ in 0..iters {
261        unsafe {
262            stream
263                .launch_kernel(
264                    &mut module,
265                    "gemm_cta64x128_mma_pipeline_fp16",
266                    &cfg,
267                    &mut args,
268                )
269                .ok()?;
270        }
271    }
272    stream.synchronize().ok()?;
273    let per_call_us = start.elapsed().as_micros() as f64 / iters as f64;
274    let flops = 2.0 * (n as f64).powi(3);
275    let tflops = flops / (per_call_us * 1e6);
276
277    Some((per_call_us, tflops))
278}
279
280/// Run cross-backend comparison.
281pub fn run_compare(kernel: &str, size: u32, backends_str: &str, json: bool) -> Result<()> {
282    let backends: Vec<&str> = backends_str.split(',').map(|s| s.trim()).collect();
283
284    if !json {
285        println!("\n=== CGP Cross-Backend Comparison: {kernel} ({size}x{size}x{size}) ===\n");
286    }
287
288    let actual = get_actual_gemm_timing(size);
289    let mut results = collect_backend_results(&backends, size, actual);
290    results.sort_by(|a, b| {
291        a.wall_time_us
292            .partial_cmp(&b.wall_time_us)
293            .unwrap_or(std::cmp::Ordering::Equal)
294    });
295
296    if json {
297        println!("{}", serde_json::to_string_pretty(&results)?);
298        return Ok(());
299    }
300
301    render_comparison_table(&results);
302    render_source_legend(&results);
303    render_best_summary(&results);
304    render_cpu_gpu_gap(&results);
305
306    println!();
307    Ok(())
308}
309
310/// Measure each requested backend and build a `BackendResult` list.
311fn collect_backend_results(
312    backends: &[&str],
313    size: u32,
314    actual: Option<(f64, f64)>,
315) -> Vec<BackendResult> {
316    let mut results: Vec<BackendResult> = Vec::new();
317    for backend in backends {
318        let Some((time_us, available, measured)) = measure_backend(backend, size, actual) else {
319            continue;
320        };
321        results.push(BackendResult {
322            name: (*backend).to_string(),
323            wall_time_us: time_us,
324            tflops: gemm_tflops(size, time_us),
325            bandwidth_gbps: 0.0,
326            available,
327            measured,
328        });
329    }
330    results
331}
332
333/// Dispatch to the backend-specific measurement routine; None for unknown backends.
334fn measure_backend(
335    backend: &str,
336    size: u32,
337    actual: Option<(f64, f64)>,
338) -> Option<(f64, bool, bool)> {
339    match backend {
340        "scalar" => Some((estimate_scalar_time_us(size), true, false)),
341        "avx2" => Some(measure_avx_backend(size, actual, false)),
342        "avx512" => Some(measure_avx_backend(size, actual, true)),
343        "neon" => Some((
344            estimate_scalar_time_us(size) / 4.0,
345            cfg!(target_arch = "aarch64"),
346            false,
347        )),
348        "cuda" => Some(measure_cuda_backend(size)),
349        "cublas" => Some(measure_cublas_backend(size)),
350        "wgpu" => Some((
351            estimate_cuda_time_us(size) * 2.0,
352            which::which("nvidia-smi").is_ok(),
353            false,
354        )),
355        other => {
356            eprintln!("  Warning: unknown backend '{other}', skipping");
357            None
358        }
359    }
360}
361
362/// Measure AVX2/AVX512 backends: prefer actual CPU bench, else fall back to estimate.
363fn measure_avx_backend(size: u32, actual: Option<(f64, f64)>, avx512: bool) -> (f64, bool, bool) {
364    #[cfg(target_arch = "x86_64")]
365    let avail = if avx512 {
366        std::arch::is_x86_feature_detected!("avx512f")
367    } else {
368        std::arch::is_x86_feature_detected!("avx2")
369    };
370    #[cfg(not(target_arch = "x86_64"))]
371    let avail = false;
372
373    if let Some((actual_us, _)) = actual {
374        return (actual_us, avail, true);
375    }
376    if avx512 {
377        (estimate_avx512_time_us(size), avail, false)
378    } else {
379        (estimate_avx2_time_us(size), avail, false)
380    }
381}
382
383/// Measure CUDA PTX backend when available + `cuda` feature is enabled.
384fn measure_cuda_backend(size: u32) -> (f64, bool, bool) {
385    let avail = which::which("nvidia-smi").is_ok();
386    if avail {
387        if let Some((time_us, _)) = try_measure_ptx(size) {
388            return (time_us, true, true);
389        }
390    }
391    (estimate_cuda_time_us(size), avail, false)
392}
393
394/// Measure cuBLAS backend when available + `cuda` feature is enabled.
395fn measure_cublas_backend(size: u32) -> (f64, bool, bool) {
396    let avail = which::which("nvidia-smi").is_ok();
397    if avail {
398        if let Some((time_us, _)) = try_measure_cublas(size) {
399            return (time_us, true, true);
400        }
401    }
402    (estimate_cublas_time_us(size), avail, false)
403}
404
405/// Shim that returns `measure_ptx_gemm` under `cuda` feature, `None` otherwise.
406#[cfg(feature = "cuda")]
407fn try_measure_ptx(size: u32) -> Option<(f64, f64)> {
408    measure_ptx_gemm(size)
409}
410
411#[cfg(not(feature = "cuda"))]
412fn try_measure_ptx(_size: u32) -> Option<(f64, f64)> {
413    None
414}
415
416/// Shim that returns `measure_cublas_gemm` under `cuda` feature, `None` otherwise.
417#[cfg(feature = "cuda")]
418fn try_measure_cublas(size: u32) -> Option<(f64, f64)> {
419    measure_cublas_gemm(size)
420}
421
422#[cfg(not(feature = "cuda"))]
423fn try_measure_cublas(_size: u32) -> Option<(f64, f64)> {
424    None
425}
426
427/// Render the main comparison table (header + one row per backend).
428fn render_comparison_table(results: &[BackendResult]) {
429    let best_time = results.first().map_or(1.0, |r| r.wall_time_us);
430    println!(
431        "  {:12} {:>12} {:>12} {:>10} {:>10} {:>8} {:>5}",
432        "Backend", "Time (us)", "TFLOP/s", "Efficiency", "vs Best", "Avail", "Src"
433    );
434    println!("  {}", "-".repeat(75));
435    let (cpu_peak, gpu_peak) = peak_performance_limits();
436    for r in results {
437        render_comparison_row(r, best_time, cpu_peak, gpu_peak);
438    }
439}
440
441/// Print a single comparison row with efficiency, ratio, availability, and source tag.
442fn render_comparison_row(r: &BackendResult, best_time: f64, cpu_peak: f64, gpu_peak: f64) {
443    let peak_tflops = if r.name.contains("cuda") || r.name.contains("cublas") || r.name == "wgpu" {
444        gpu_peak / 1e12
445    } else {
446        cpu_peak / 1e12
447    };
448    let efficiency = if peak_tflops > 0.0 {
449        r.tflops / peak_tflops * 100.0
450    } else {
451        0.0
452    };
453    let ratio = format!("{:.2}x", r.wall_time_us / best_time);
454    let avail = if r.available { "yes" } else { "no" };
455    let time_str = if r.wall_time_us >= 1000.0 {
456        format!("{:.1} ms", r.wall_time_us / 1000.0)
457    } else {
458        format!("{:.1}", r.wall_time_us)
459    };
460    let src = if r.measured { "M" } else { "E" };
461    println!(
462        "  {:12} {:>12} {:>12.1} {:>9.1}% {:>10} {:>8} {:>5}",
463        r.name, time_str, r.tflops, efficiency, ratio, avail, src
464    );
465}
466
467/// Roofline-derived peak CPU/GPU FLOP/s (used to compute per-row efficiency).
468fn peak_performance_limits() -> (f64, f64) {
469    let model = RooflineModel::rtx_4090();
470    let gpu_peak = model
471        .peak_compute
472        .get(&Precision::Fp16)
473        .copied()
474        .unwrap_or(330.0e12);
475    let cores = num_cpus::get_physical();
476    #[allow(clippy::cast_precision_loss)]
477    let cpu_peak = 2.0 * 8.0 * 2.0 * 3.5e9 * cores as f64; // AVX2 peak
478    (cpu_peak, gpu_peak)
479}
480
481/// Show the "Src: M=measured E=estimated" legend when any results were produced.
482fn render_source_legend(results: &[BackendResult]) {
483    let has_measured = results.iter().any(|r| r.measured);
484    let has_estimated = results.iter().any(|r| !r.measured);
485    if !(has_measured || has_estimated) {
486        return;
487    }
488    print!("  Src: ");
489    if has_measured {
490        print!("M=measured ");
491    }
492    if has_estimated {
493        print!("E=estimated ");
494    }
495    println!();
496}
497
498/// Print the "Best: X (Ny faster than Z)" summary line when results are present.
499fn render_best_summary(results: &[BackendResult]) {
500    let Some(best) = results.first() else {
501        return;
502    };
503    let Some(worst) = results.last() else {
504        return;
505    };
506    let speedup = worst.wall_time_us / best.wall_time_us;
507    println!(
508        "\n  Best: {} ({:.1}x faster than {})",
509        best.name, speedup, worst.name
510    );
511}
512
513/// Print "CPU→GPU gap: Nx" when both CPU and GPU backends were measured.
514fn render_cpu_gpu_gap(results: &[BackendResult]) {
515    let has_cpu = results
516        .iter()
517        .any(|r| matches!(r.name.as_str(), "scalar" | "avx2" | "avx512"));
518    let has_gpu = results
519        .iter()
520        .any(|r| matches!(r.name.as_str(), "cuda" | "cublas" | "wgpu"));
521    if !(has_cpu && has_gpu) {
522        return;
523    }
524    let best_cpu = results
525        .iter()
526        .filter(|r| matches!(r.name.as_str(), "scalar" | "avx2" | "avx512"))
527        .map(|r| r.wall_time_us)
528        .fold(f64::INFINITY, f64::min);
529    let best_gpu = results
530        .iter()
531        .filter(|r| matches!(r.name.as_str(), "cuda" | "cublas" | "wgpu"))
532        .map(|r| r.wall_time_us)
533        .fold(f64::INFINITY, f64::min);
534    if best_gpu > 0.0 {
535        println!(
536            "  CPU→GPU gap: {:.0}x (expected for large GEMM)",
537            best_cpu / best_gpu
538        );
539    }
540}
541
542#[cfg(test)]
543mod tests {
544    use super::*;
545
546    #[test]
547    fn test_gemm_tflops() {
548        // 512^3 GEMM at 23.2us = 2*512^3 / 23.2e-6 / 1e12
549        let tflops = gemm_tflops(512, 23.2);
550        assert!(
551            (tflops - 11.56).abs() < 0.1,
552            "Expected ~11.6 TFLOP/s, got {tflops:.2}"
553        );
554    }
555
556    #[test]
557    fn test_scalar_slower_than_avx2() {
558        let scalar = estimate_scalar_time_us(512);
559        let avx2 = estimate_avx2_time_us(512);
560        assert!(scalar > avx2 * 3.0, "Scalar should be >3x slower than AVX2");
561    }
562
563    #[test]
564    fn test_cuda_faster_than_cpu() {
565        let cpu = estimate_avx2_time_us(4096);
566        let cuda = estimate_cuda_time_us(4096);
567        assert!(
568            cpu > cuda * 10.0,
569            "CPU should be >10x slower than CUDA for 4096"
570        );
571    }
572
573    /// FALSIFY-CGP-040: CUDA must be faster than scalar for GEMM >= 256.
574    #[test]
575    fn test_cuda_faster_than_scalar_at_256() {
576        let scalar = estimate_scalar_time_us(256);
577        let cuda = estimate_cuda_time_us(256);
578        assert!(cuda < scalar, "CUDA should be faster than scalar at 256");
579    }
580
581    /// FALSIFY-CGP-041: SIMD must be faster than scalar (>= 3x at 1024).
582    #[test]
583    fn test_simd_faster_than_scalar() {
584        let scalar = estimate_scalar_time_us(1024);
585        let avx2 = estimate_avx2_time_us(1024);
586        assert!(
587            scalar / avx2 >= 3.0,
588            "AVX2 speedup {:.1}x should be >= 3x",
589            scalar / avx2
590        );
591    }
592
593    /// FALSIFY-CGP-042: cuBLAS must be faster than pure PTX for large GEMM.
594    #[test]
595    fn test_cublas_faster_than_ptx() {
596        let ptx = estimate_cuda_time_us(4096);
597        let cublas = estimate_cublas_time_us(4096);
598        assert!(cublas < ptx, "cuBLAS should be faster than PTX at 4096");
599    }
600
601    #[test]
602    fn test_run_compare_basic() {
603        let result = run_compare("gemm", 256, "scalar,avx2", false);
604        assert!(result.is_ok());
605    }
606
607    #[test]
608    fn test_run_compare_json() {
609        let result = run_compare("gemm", 256, "scalar,avx2", true);
610        assert!(result.is_ok());
611    }
612
613    /// FALSIFY-CGP-ACTUAL-001: Actual benchmark data is available and parseable.
614    #[test]
615    fn test_get_actual_gemm_timing() {
616        if let Some((time_us, gflops)) = get_actual_gemm_timing(1024) {
617            assert!(time_us > 0.0, "time should be positive");
618            assert!(gflops > 10.0, "GFLOPS should be > 10 for 1024 GEMM");
619            assert!(gflops < 2000.0, "GFLOPS should be < 2000");
620            eprintln!(
621                "Actual GEMM 1024: {:.1} us = {:.0} GFLOPS [MEASURED]",
622                time_us, gflops
623            );
624        } else {
625            eprintln!("benchmark_matrix_suite binary not found — actual data unavailable");
626        }
627    }
628}