// cgp/analysis/compare.rs
1//! `cgp profile compare` — Cross-backend comparison.
2//! Spec section 2.2: run the same workload across multiple backends
3//! and produce a comparison table with TFLOP/s, bandwidth, and speedup ratios.
4
5use crate::analysis::roofline::{Precision, RooflineModel};
6use anyhow::Result;
7use serde::Serialize;
8
/// Per-backend result row for a cross-backend comparison run.
///
/// One row is produced for each backend requested in `run_compare`, then
/// sorted fastest-first for table/JSON output.
#[derive(Debug, Clone, Serialize)]
pub struct BackendResult {
    /// Backend identifier as given on the CLI (e.g. "scalar", "avx2", "cuda").
    pub name: String,
    /// Wall-clock time of one GEMM invocation, in microseconds.
    pub wall_time_us: f64,
    /// Throughput derived from `wall_time_us` via `gemm_tflops` (2*N^3 flops).
    pub tflops: f64,
    /// Memory bandwidth in GB/s. NOTE(review): `run_compare` always stores
    /// 0.0 here — never computed; confirm whether this field is still planned.
    pub bandwidth_gbps: f64,
    /// Whether the backend is usable on this machine (feature/tool probe).
    pub available: bool,
    /// Whether data comes from actual measurement vs estimation.
    /// `Not::not` makes serde skip this field when it is `false`, so JSON
    /// output only carries `"measured": true` entries.
    #[serde(skip_serializing_if = "std::ops::Not::not")]
    pub measured: bool,
}
21
/// TFLOP/s achieved by a square GEMM (`2*N^3` flops) finished in `time_us`.
///
/// Non-positive times yield 0.0 instead of dividing by zero.
fn gemm_tflops(size: u32, time_us: f64) -> f64 {
    if time_us > 0.0 {
        let n = size as f64;
        let seconds = time_us * 1e-6;
        (2.0 * n * n * n) / seconds / 1e12
    } else {
        0.0
    }
}
30
/// Try to get actual GEMM timing from benchmark_matrix_suite binary.
/// Returns (time_us, gflops) if the binary exists and the size is benchmarked.
///
/// Any failure — binary missing, non-zero exit, unexpected output shape,
/// unparseable numbers — collapses to `None` via the `?`/`.ok()?` chain.
fn get_actual_gemm_timing(size: u32) -> Option<(f64, f64)> {
    // Candidate locations checked in order; the first existing path wins.
    let candidates = [
        "/mnt/nvme-raid0/targets/trueno/release/examples/benchmark_matrix_suite",
        "./target/release/examples/benchmark_matrix_suite",
    ];
    let binary_path = candidates
        .iter()
        .find(|p| std::path::Path::new(p).exists())?;
    // NOTE(review): `.output()` already captures stdout/stderr as piped, so
    // the explicit `Stdio::piped()` calls below are redundant (harmless).
    let output = std::process::Command::new(*binary_path)
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    let stdout = String::from_utf8_lossy(&output.stdout);
    // The suite prints one line per benchmarked square size; match on the
    // "(NxNxN)" tag for the size we want.
    let pattern = format!("Matrix Multiplication ({}x{}x{})", size, size, size);

    for line in stdout.lines() {
        if line.contains(&pattern) {
            // Format: "  Matrix Multiplication (NxNxN)...     X.XX ms  (Y.YY GFLOPS)"
            // Everything after the "..." separator holds the two numbers.
            let after_dots = line.split("...").nth(1)?;
            // Time is the token preceding "ms".
            let time_ms = after_dots.split("ms").next()?.trim().parse::<f64>().ok()?;
            // GFLOPS sits between '(' and " GFLOPS".
            let gflops = after_dots
                .split('(')
                .nth(1)?
                .split(" GFLOPS")
                .next()?
                .trim()
                .parse::<f64>()
                .ok()?;
            // Convert ms -> us for the caller.
            return Some((time_ms * 1000.0, gflops));
        }
    }
    None
}
70
/// Estimate scalar GEMM time from measured data on Threadripper 7960X.
/// Reference GEMM: 256→11.7ms, cubic scaling.
fn estimate_scalar_time_us(size: u32) -> f64 {
    // Scale the 256x256 calibration point (11.7 ms) by (size/256)^3.
    const BASE_256_US: f64 = 11_700.0;
    let scale = size as f64 / 256.0;
    BASE_256_US * scale * scale * scale
}
78
/// Estimate AVX2 BLIS single-thread GEMM from measured data.
/// Calibrated: 256→0.57ms, 512→3.75ms, 1024→30.1ms (71 GFLOPS).
fn estimate_avx2_time_us(size: u32) -> f64 {
    // Sustained single-thread BLIS rate measured on Threadripper 7960X.
    const SUSTAINED_GFLOPS: f64 = 72.0;
    let n = size as f64;
    let flops = 2.0 * n * n * n;
    // seconds = flops / (GFLOPS * 1e9); microseconds = seconds * 1e6.
    flops / (SUSTAINED_GFLOPS * 1e9) * 1e6
}
87
/// Estimate AVX-512 BLIS GEMM (slightly faster than AVX2, but clock throttle).
/// ~80 GFLOPS measured single-thread (AVX-512 downclocking limits gains).
fn estimate_avx512_time_us(size: u32) -> f64 {
    // 80 GFLOPS sustained: downclocking keeps AVX-512 only ~10% ahead of AVX2.
    let n = size as f64;
    (2.0 * n * n * n) / (80.0 * 1e9) * 1e6
}
95
/// Estimate CUDA CTA WMMA GEMM from measured data on RTX 4090.
/// Calibrated: 23.2us at 512x512 = 11.6 TFLOP/s.
fn estimate_cuda_time_us(size: u32) -> f64 {
    // Cubic scaling from the 512x512 calibration point (23.2 us).
    let scale = size as f64 / 512.0;
    23.2 * (scale * scale * scale)
}
102
103/// Estimate cuBLAS GEMM from measured RTX 4090 data.
104/// cuBLAS achieves ~35 TFLOP/s FP16 on RTX 4090 (~3x pure PTX).
105fn estimate_cublas_time_us(size: u32) -> f64 {
106    estimate_cuda_time_us(size) / 3.0
107}
108
/// Measure actual cuBLAS FP16 GEMM throughput via trueno-gpu driver.
/// Returns (time_us, tflops) or None if CUDA unavailable.
///
/// A and B are filled with FP16 1.0 (bit pattern 0x3C00); 5 warmup calls run
/// before a size-dependent timed batch bracketed by stream synchronizes.
/// Every driver-level failure collapses to `None` via `.ok()?`.
#[cfg(feature = "cuda")]
fn measure_cublas_gemm(size: u32) -> Option<(f64, f64)> {
    use trueno_gpu::driver::{CublasHandle, CudaContext, CudaStream, GemmOp, GpuBuffer};

    // Device 0 only; multi-GPU selection is out of scope here.
    let ctx = CudaContext::new(0).ok()?;
    let stream = CudaStream::new(&ctx).ok()?;
    let handle = CublasHandle::new(&ctx).ok()?;
    handle.set_stream(&stream).ok()?;

    let n = size as usize;
    let a_data = vec![0x3C00u16; n * n]; // 1.0 in FP16
    let b_data = vec![0x3C00u16; n * n];
    let c_data = vec![0u16; n * n];

    let a_buf = GpuBuffer::from_host(&ctx, &a_data).ok()?;
    let b_buf = GpuBuffer::from_host(&ctx, &b_data).ok()?;
    let c_buf = GpuBuffer::from_host(&ctx, &c_data).ok()?;

    // Warmup
    // NOTE(review): gemm_f16 results are deliberately ignored here; a failing
    // call would only surface via the later synchronize — confirm acceptable.
    for _ in 0..5 {
        let _ = handle.gemm_f16(
            GemmOp::NoTrans,
            GemmOp::NoTrans,
            n as i32,
            n as i32,
            n as i32,
            1.0,
            a_buf.as_ptr(),
            n as i32,
            b_buf.as_ptr(),
            n as i32,
            0.0,
            c_buf.as_ptr(),
            n as i32,
        );
    }
    stream.synchronize().ok()?;

    // Fewer iterations for larger sizes to bound total benchmark runtime.
    let iters: u32 = if n <= 512 {
        200
    } else if n <= 1024 {
        100
    } else {
        30
    };
    let start = std::time::Instant::now();
    for _ in 0..iters {
        let _ = handle.gemm_f16(
            GemmOp::NoTrans,
            GemmOp::NoTrans,
            n as i32,
            n as i32,
            n as i32,
            1.0,
            a_buf.as_ptr(),
            n as i32,
            b_buf.as_ptr(),
            n as i32,
            0.0,
            c_buf.as_ptr(),
            n as i32,
        );
    }
    // Synchronize before reading the clock so all queued kernels are counted.
    stream.synchronize().ok()?;
    let elapsed = start.elapsed();

    let per_call_us = elapsed.as_micros() as f64 / iters as f64;
    let flops = 2.0 * (n as f64).powi(3);
    // flops / (us * 1e-6) / 1e12 simplifies to flops / (us * 1e6).
    let tflops = flops / (per_call_us * 1e6);

    Some((per_call_us, tflops))
}
183
/// Measure our best PTX GEMM kernel (64×128 pipeline) on GPU.
/// Returns (time_us, tflops) or None if CUDA unavailable.
///
/// Builds the kernel PTX at runtime for an N×N×N problem, loads it as a
/// module, then times launches the same way `measure_cublas_gemm` does
/// (5 warmups, synchronized timed batch). Failures map to `None`.
#[cfg(feature = "cuda")]
fn measure_ptx_gemm(size: u32) -> Option<(f64, f64)> {
    use std::ffi::c_void;
    use trueno_gpu::driver::{CudaContext, CudaModule, CudaStream, GpuBuffer, LaunchConfig};
    use trueno_gpu::kernels::build_cta64x128_mma_pipeline_fp16;
    use trueno_gpu::ptx::PtxModule;

    let ctx = CudaContext::new(0).ok()?;
    let stream = CudaStream::new(&ctx).ok()?;

    let n = size as usize;
    // FP16 inputs (1.0 = 0x3C00); FP32 output buffer for the accumulator.
    let a16 = vec![0x3C00u16; n * n];
    let b16 = vec![0x3C00u16; n * n];
    let c32 = vec![0.0f32; n * n];

    let a_buf = GpuBuffer::from_host(&ctx, &a16).ok()?;
    let b_buf = GpuBuffer::from_host(&ctx, &b16).ok()?;
    let c_buf = GpuBuffer::from_host(&ctx, &c32).ok()?;

    // Emit PTX for this exact problem size; sm_80 target per the builder API.
    let kernel = build_cta64x128_mma_pipeline_fp16(n as u32, n as u32, n as u32);
    let ptx = PtxModule::new().target("sm_80").add_kernel(kernel).emit();
    let mut module = CudaModule::from_ptx(&ctx, &ptx).ok()?;

    // One CTA per 128x64 output tile (ceil-div), 512 threads per block.
    // NOTE(review): shared_mem = 18432 bytes presumably matches the kernel's
    // staging buffers — confirm against the kernel builder.
    let cfg = LaunchConfig {
        grid: (((n + 127) / 128) as u32, ((n + 63) / 64) as u32, 1),
        block: (512, 1, 1),
        shared_mem: 18432,
    };

    // Kernel arguments are passed as pointers to these locals; they must stay
    // alive (and unmoved) for every launch below.
    let mut a_ptr = a_buf.as_ptr();
    let mut b_ptr = b_buf.as_ptr();
    let mut c_ptr = c_buf.as_ptr();
    let mut m_v = n as u32;
    let mut n_v = n as u32;
    let mut k_v = n as u32;
    let mut args: Vec<*mut c_void> = vec![
        &mut a_ptr as *mut _ as *mut c_void,
        &mut b_ptr as *mut _ as *mut c_void,
        &mut c_ptr as *mut _ as *mut c_void,
        &mut m_v as *mut _ as *mut c_void,
        &mut n_v as *mut _ as *mut c_void,
        &mut k_v as *mut _ as *mut c_void,
    ];

    // Warmup
    for _ in 0..5 {
        unsafe {
            stream
                .launch_kernel(
                    &mut module,
                    "gemm_cta64x128_mma_pipeline_fp16",
                    &cfg,
                    &mut args,
                )
                .ok()?;
        }
    }
    stream.synchronize().ok()?;

    // Fewer iterations at larger sizes to bound total runtime.
    let iters: u32 = if n <= 512 {
        100
    } else if n <= 1024 {
        50
    } else {
        20
    };
    let start = std::time::Instant::now();
    for _ in 0..iters {
        unsafe {
            stream
                .launch_kernel(
                    &mut module,
                    "gemm_cta64x128_mma_pipeline_fp16",
                    &cfg,
                    &mut args,
                )
                .ok()?;
        }
    }
    // Drain the stream before stopping the clock.
    stream.synchronize().ok()?;
    let per_call_us = start.elapsed().as_micros() as f64 / iters as f64;
    let flops = 2.0 * (n as f64).powi(3);
    // flops / (us * 1e-6) / 1e12 simplifies to flops / (us * 1e6).
    let tflops = flops / (per_call_us * 1e6);

    Some((per_call_us, tflops))
}
272
/// Run cross-backend comparison.
///
/// Builds one `BackendResult` per backend named in the comma-separated
/// `backends_str`, preferring measured timings where a measurement path
/// exists and falling back to calibrated estimates, then emits either a
/// pretty-printed JSON array (`json == true`) or a human-readable table
/// sorted fastest-first.
///
/// NOTE(review): `kernel` only appears in the banner; all timings assume a
/// square GEMM of dimension `size` — confirm before adding other kernels.
pub fn run_compare(kernel: &str, size: u32, backends_str: &str, json: bool) -> Result<()> {
    let backends: Vec<&str> = backends_str.split(',').map(|s| s.trim()).collect();

    if !json {
        println!("\n=== CGP Cross-Backend Comparison: {kernel} ({size}x{size}x{size}) ===\n");
    }

    let mut results: Vec<BackendResult> = Vec::new();

    // Try to get actual benchmark data first (runs once, cached for all CPU backends)
    let actual = get_actual_gemm_timing(size);

    for backend in &backends {
        // Each arm yields (time_us, available, measured); unknown names skip
        // the push entirely via `continue`.
        let (time_us, available, measured) = match *backend {
            "scalar" => (estimate_scalar_time_us(size), true, false),
            "avx2" | "avx512" => {
                // Availability is a runtime CPU-feature probe (x86_64 only).
                #[cfg(target_arch = "x86_64")]
                let avail = if *backend == "avx512" {
                    std::arch::is_x86_feature_detected!("avx512f")
                } else {
                    std::arch::is_x86_feature_detected!("avx2")
                };
                #[cfg(not(target_arch = "x86_64"))]
                let avail = false;

                // Use actual benchmark if available (runs with best SIMD available)
                // NOTE(review): the same measured number is reused for both
                // avx2 and avx512 rows, since the suite runs whichever SIMD
                // path is best — the two rows are not independent.
                if let Some((actual_us, _)) = actual {
                    (actual_us, avail, true)
                } else if *backend == "avx512" {
                    (estimate_avx512_time_us(size), avail, false)
                } else {
                    (estimate_avx2_time_us(size), avail, false)
                }
            }
            "neon" => {
                let avail = cfg!(target_arch = "aarch64");
                // Rough 4x-over-scalar guess; no NEON calibration data here.
                (estimate_scalar_time_us(size) / 4.0, avail, false)
            }
            "cuda" => {
                // `nvidia-smi` on PATH is used as a proxy for a usable GPU.
                let avail = which::which("nvidia-smi").is_ok();
                #[cfg(feature = "cuda")]
                if avail {
                    if let Some((time_us, _tflops)) = measure_ptx_gemm(size) {
                        (time_us, true, true)
                    } else {
                        (estimate_cuda_time_us(size), avail, false)
                    }
                } else {
                    (estimate_cuda_time_us(size), avail, false)
                }
                // Without the cuda feature we can only estimate.
                #[cfg(not(feature = "cuda"))]
                {
                    (estimate_cuda_time_us(size), avail, false)
                }
            }
            "cublas" => {
                let avail = which::which("nvidia-smi").is_ok();
                // CGP-DBUF: actual cuBLAS measurement when cuda feature enabled
                #[cfg(feature = "cuda")]
                if avail {
                    if let Some((time_us, _tflops)) = measure_cublas_gemm(size) {
                        (time_us, true, true)
                    } else {
                        (estimate_cublas_time_us(size), avail, false)
                    }
                } else {
                    (estimate_cublas_time_us(size), avail, false)
                }
                #[cfg(not(feature = "cuda"))]
                {
                    (estimate_cublas_time_us(size), avail, false)
                }
            }
            "wgpu" => {
                let avail = which::which("nvidia-smi").is_ok();
                // Estimated at ~2x the CUDA kernel time; never measured here.
                (estimate_cuda_time_us(size) * 2.0, avail, false)
            }
            other => {
                eprintln!("  Warning: unknown backend '{other}', skipping");
                continue;
            }
        };

        // Throughput is always recomputed from wall time for consistency,
        // even when the measurement path already reported TFLOP/s.
        let tflops = gemm_tflops(size, time_us);

        results.push(BackendResult {
            name: backend.to_string(),
            wall_time_us: time_us,
            tflops,
            // Never computed — see `BackendResult::bandwidth_gbps`.
            bandwidth_gbps: 0.0,
            available,
            measured,
        });
    }

    // Sort by performance (fastest first)
    results.sort_by(|a, b| {
        a.wall_time_us
            .partial_cmp(&b.wall_time_us)
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    if json {
        println!("{}", serde_json::to_string_pretty(&results)?);
        return Ok(());
    }

    // Fallback of 1.0 only matters when results is empty (ratio loop no-ops).
    let best_time = results.first().map(|r| r.wall_time_us).unwrap_or(1.0);

    // Table header
    println!(
        "  {:12} {:>12} {:>12} {:>10} {:>10} {:>8} {:>5}",
        "Backend", "Time (us)", "TFLOP/s", "Efficiency", "vs Best", "Avail", "Src"
    );
    println!("  {}", "-".repeat(75));

    // Get roofline for efficiency
    let model = RooflineModel::rtx_4090();
    let gpu_peak = model
        .peak_compute
        .get(&Precision::Fp16)
        .copied()
        .unwrap_or(330.0e12);
    let cores = num_cpus::get_physical();
    // 2 FMA units x 8 f32 lanes x 2 flops/FMA x 3.5 GHz per physical core
    // — NOTE(review): clock is hard-coded; confirm for the host CPU.
    let cpu_peak = 2.0 * 8.0 * 2.0 * 3.5e9 * cores as f64; // AVX2 peak

    for r in &results {
        // Pick the roofline peak matching the backend's device class.
        let peak = if r.name.contains("cuda") || r.name.contains("cublas") || r.name == "wgpu" {
            gpu_peak / 1e12
        } else {
            cpu_peak / 1e12
        };
        let efficiency = if peak > 0.0 {
            r.tflops / peak * 100.0
        } else {
            0.0
        };
        let ratio = format!("{:.2}x", r.wall_time_us / best_time);
        let avail = if r.available { "yes" } else { "no" };

        // Switch to ms display above 1000 us for readability.
        let time_str = if r.wall_time_us >= 1000.0 {
            format!("{:.1} ms", r.wall_time_us / 1000.0)
        } else {
            format!("{:.1}", r.wall_time_us)
        };

        let src = if r.measured { "M" } else { "E" };
        println!(
            "  {:12} {:>12} {:>12.1} {:>9.1}% {:>10} {:>8} {:>5}",
            r.name, time_str, r.tflops, efficiency, ratio, avail, src
        );
    }

    // Legend for the Src column, showing only the markers actually used.
    let has_measured = results.iter().any(|r| r.measured);
    let has_estimated = results.iter().any(|r| !r.measured);
    if has_measured || has_estimated {
        print!("  Src: ");
        if has_measured {
            print!("M=measured ");
        }
        if has_estimated {
            print!("E=estimated ");
        }
        println!();
    }

    // Summary
    if let Some(best) = results.first() {
        if let Some(worst) = results.last() {
            let speedup = worst.wall_time_us / best.wall_time_us;
            println!(
                "\n  Best: {} ({:.1}x faster than {})",
                best.name, speedup, worst.name
            );
        }
    }

    // Show CPU vs GPU gap if both present
    let has_cpu = results
        .iter()
        .any(|r| matches!(r.name.as_str(), "scalar" | "avx2" | "avx512"));
    let has_gpu = results
        .iter()
        .any(|r| matches!(r.name.as_str(), "cuda" | "cublas" | "wgpu"));
    if has_cpu && has_gpu {
        let best_cpu = results
            .iter()
            .filter(|r| matches!(r.name.as_str(), "scalar" | "avx2" | "avx512"))
            .map(|r| r.wall_time_us)
            .fold(f64::INFINITY, f64::min);
        let best_gpu = results
            .iter()
            .filter(|r| matches!(r.name.as_str(), "cuda" | "cublas" | "wgpu"))
            .map(|r| r.wall_time_us)
            .fold(f64::INFINITY, f64::min);
        if best_gpu > 0.0 {
            println!(
                "  CPU→GPU gap: {:.0}x (expected for large GEMM)",
                best_cpu / best_gpu
            );
        }
    }

    println!();
    Ok(())
}
480
#[cfg(test)]
mod tests {
    use super::*;

    /// gemm_tflops must reproduce the 512^3 @ 23.2us calibration point.
    #[test]
    fn test_gemm_tflops() {
        // 512^3 GEMM at 23.2us = 2*512^3 / 23.2e-6 / 1e12
        let tflops = gemm_tflops(512, 23.2);
        let delta = (tflops - 11.56).abs();
        assert!(delta < 0.1, "Expected ~11.6 TFLOP/s, got {tflops:.2}");
    }

    #[test]
    fn test_scalar_slower_than_avx2() {
        let slow = estimate_scalar_time_us(512);
        let fast = estimate_avx2_time_us(512);
        assert!(slow > fast * 3.0, "Scalar should be >3x slower than AVX2");
    }

    #[test]
    fn test_cuda_faster_than_cpu() {
        let cpu_us = estimate_avx2_time_us(4096);
        let gpu_us = estimate_cuda_time_us(4096);
        assert!(
            cpu_us > gpu_us * 10.0,
            "CPU should be >10x slower than CUDA for 4096"
        );
    }

    /// FALSIFY-CGP-040: CUDA must be faster than scalar for GEMM >= 256.
    #[test]
    fn test_cuda_faster_than_scalar_at_256() {
        let gpu_us = estimate_cuda_time_us(256);
        let cpu_us = estimate_scalar_time_us(256);
        assert!(gpu_us < cpu_us, "CUDA should be faster than scalar at 256");
    }

    /// FALSIFY-CGP-041: SIMD must be faster than scalar (>= 3x at 1024).
    #[test]
    fn test_simd_faster_than_scalar() {
        let speedup = estimate_scalar_time_us(1024) / estimate_avx2_time_us(1024);
        assert!(
            speedup >= 3.0,
            "AVX2 speedup {:.1}x should be >= 3x",
            speedup
        );
    }

    /// FALSIFY-CGP-042: cuBLAS must be faster than pure PTX for large GEMM.
    #[test]
    fn test_cublas_faster_than_ptx() {
        let cublas_us = estimate_cublas_time_us(4096);
        let ptx_us = estimate_cuda_time_us(4096);
        assert!(cublas_us < ptx_us, "cuBLAS should be faster than PTX at 4096");
    }

    #[test]
    fn test_run_compare_basic() {
        assert!(run_compare("gemm", 256, "scalar,avx2", false).is_ok());
    }

    #[test]
    fn test_run_compare_json() {
        assert!(run_compare("gemm", 256, "scalar,avx2", true).is_ok());
    }

    /// FALSIFY-CGP-ACTUAL-001: Actual benchmark data is available and parseable.
    #[test]
    fn test_get_actual_gemm_timing() {
        match get_actual_gemm_timing(1024) {
            Some((time_us, gflops)) => {
                assert!(time_us > 0.0, "time should be positive");
                assert!(gflops > 10.0, "GFLOPS should be > 10 for 1024 GEMM");
                assert!(gflops < 2000.0, "GFLOPS should be < 2000");
                eprintln!(
                    "Actual GEMM 1024: {:.1} us = {:.0} GFLOPS [MEASURED]",
                    time_us, gflops
                );
            }
            None => {
                eprintln!("benchmark_matrix_suite binary not found — actual data unavailable");
            }
        }
    }
}