1use crate::analysis::roofline::{Precision, RooflineModel};
6use anyhow::Result;
7use serde::Serialize;
8
9#[derive(Debug, Clone, Serialize)]
11pub struct BackendResult {
12 pub name: String,
13 pub wall_time_us: f64,
14 pub tflops: f64,
15 pub bandwidth_gbps: f64,
16 pub available: bool,
17 #[serde(skip_serializing_if = "std::ops::Not::not")]
19 pub measured: bool,
20}
21
22fn gemm_tflops(size: u32, time_us: f64) -> f64 {
24 if time_us <= 0.0 {
25 return 0.0;
26 }
27 let flops = 2.0 * (size as f64).powi(3);
28 flops / (time_us * 1e-6) / 1e12
29}
30
31fn get_actual_gemm_timing(size: u32) -> Option<(f64, f64)> {
34 let stdout = run_benchmark_suite()?;
35 let pattern = format!("Matrix Multiplication ({size}x{size}x{size})");
36 stdout
37 .lines()
38 .find(|line| line.contains(&pattern))
39 .and_then(parse_benchmark_line)
40}
41
42fn run_benchmark_suite() -> Option<String> {
44 let candidates = [
45 "/mnt/nvme-raid0/targets/trueno/release/examples/benchmark_matrix_suite",
46 "./target/release/examples/benchmark_matrix_suite",
47 ];
48 let binary_path = candidates
49 .iter()
50 .find(|p| std::path::Path::new(p).exists())?;
51 let output = std::process::Command::new(*binary_path)
52 .stdout(std::process::Stdio::piped())
53 .stderr(std::process::Stdio::piped())
54 .output()
55 .ok()?;
56 if !output.status.success() {
57 return None;
58 }
59 Some(String::from_utf8_lossy(&output.stdout).into_owned())
60}
61
62fn parse_benchmark_line(line: &str) -> Option<(f64, f64)> {
65 let after_dots = line.split("...").nth(1)?;
66 let time_ms = after_dots.split("ms").next()?.trim().parse::<f64>().ok()?;
67 let gflops = after_dots
68 .split('(')
69 .nth(1)?
70 .split(" GFLOPS")
71 .next()?
72 .trim()
73 .parse::<f64>()
74 .ok()?;
75 Some((time_ms * 1000.0, gflops))
76}
77
78fn estimate_scalar_time_us(size: u32) -> f64 {
81 let ratio = (size as f64 / 256.0).powi(3);
83 11_700.0 * ratio
84}
85
86fn estimate_avx2_time_us(size: u32) -> f64 {
89 let flops = 2.0 * (size as f64).powi(3);
91 let gflops = 72.0; flops / (gflops * 1e9) * 1e6
93}
94
95fn estimate_avx512_time_us(size: u32) -> f64 {
98 let flops = 2.0 * (size as f64).powi(3);
99 let gflops = 80.0; flops / (gflops * 1e9) * 1e6
101}
102
103fn estimate_cuda_time_us(size: u32) -> f64 {
106 let ratio = (size as f64 / 512.0).powi(3);
107 23.2 * ratio
108}
109
110fn estimate_cublas_time_us(size: u32) -> f64 {
113 estimate_cuda_time_us(size) / 3.0
114}
115
116#[cfg(feature = "cuda")]
119fn measure_cublas_gemm(size: u32) -> Option<(f64, f64)> {
120 use trueno_gpu::driver::{CublasHandle, CudaContext, CudaStream, GemmOp, GpuBuffer};
121
122 let ctx = CudaContext::new(0).ok()?;
123 let stream = CudaStream::new(&ctx).ok()?;
124 let handle = CublasHandle::new(&ctx).ok()?;
125 handle.set_stream(&stream).ok()?;
126
127 let n = size as usize;
128 let a_data = vec![0x3C00u16; n * n]; let b_data = vec![0x3C00u16; n * n];
130 let c_data = vec![0u16; n * n];
131
132 let a_buf = GpuBuffer::from_host(&ctx, &a_data).ok()?;
133 let b_buf = GpuBuffer::from_host(&ctx, &b_data).ok()?;
134 let c_buf = GpuBuffer::from_host(&ctx, &c_data).ok()?;
135
136 for _ in 0..5 {
138 let _ = handle.gemm_f16(
139 GemmOp::NoTrans,
140 GemmOp::NoTrans,
141 n as i32,
142 n as i32,
143 n as i32,
144 1.0,
145 a_buf.as_ptr(),
146 n as i32,
147 b_buf.as_ptr(),
148 n as i32,
149 0.0,
150 c_buf.as_ptr(),
151 n as i32,
152 );
153 }
154 stream.synchronize().ok()?;
155
156 let iters: u32 = if n <= 512 {
157 200
158 } else if n <= 1024 {
159 100
160 } else {
161 30
162 };
163 let start = std::time::Instant::now();
164 for _ in 0..iters {
165 let _ = handle.gemm_f16(
166 GemmOp::NoTrans,
167 GemmOp::NoTrans,
168 n as i32,
169 n as i32,
170 n as i32,
171 1.0,
172 a_buf.as_ptr(),
173 n as i32,
174 b_buf.as_ptr(),
175 n as i32,
176 0.0,
177 c_buf.as_ptr(),
178 n as i32,
179 );
180 }
181 stream.synchronize().ok()?;
182 let elapsed = start.elapsed();
183
184 let per_call_us = elapsed.as_micros() as f64 / iters as f64;
185 let flops = 2.0 * (n as f64).powi(3);
186 let tflops = flops / (per_call_us * 1e6);
187
188 Some((per_call_us, tflops))
189}
190
191#[cfg(feature = "cuda")]
194fn measure_ptx_gemm(size: u32) -> Option<(f64, f64)> {
195 use std::ffi::c_void;
196 use trueno_gpu::driver::{CudaContext, CudaModule, CudaStream, GpuBuffer, LaunchConfig};
197 use trueno_gpu::kernels::build_cta64x128_mma_pipeline_fp16;
198 use trueno_gpu::ptx::PtxModule;
199
200 let ctx = CudaContext::new(0).ok()?;
201 let stream = CudaStream::new(&ctx).ok()?;
202
203 let n = size as usize;
204 let a16 = vec![0x3C00u16; n * n];
205 let b16 = vec![0x3C00u16; n * n];
206 let c32 = vec![0.0f32; n * n];
207
208 let a_buf = GpuBuffer::from_host(&ctx, &a16).ok()?;
209 let b_buf = GpuBuffer::from_host(&ctx, &b16).ok()?;
210 let c_buf = GpuBuffer::from_host(&ctx, &c32).ok()?;
211
212 let kernel = build_cta64x128_mma_pipeline_fp16(n as u32, n as u32, n as u32);
213 let ptx = PtxModule::new().target("sm_80").add_kernel(kernel).emit();
214 let mut module = CudaModule::from_ptx(&ctx, &ptx).ok()?;
215
216 let cfg = LaunchConfig {
217 grid: (((n + 127) / 128) as u32, ((n + 63) / 64) as u32, 1),
218 block: (512, 1, 1),
219 shared_mem: 18432,
220 };
221
222 let mut a_ptr = a_buf.as_ptr();
223 let mut b_ptr = b_buf.as_ptr();
224 let mut c_ptr = c_buf.as_ptr();
225 let mut m_v = n as u32;
226 let mut n_v = n as u32;
227 let mut k_v = n as u32;
228 let mut args: Vec<*mut c_void> = vec![
229 &mut a_ptr as *mut _ as *mut c_void,
230 &mut b_ptr as *mut _ as *mut c_void,
231 &mut c_ptr as *mut _ as *mut c_void,
232 &mut m_v as *mut _ as *mut c_void,
233 &mut n_v as *mut _ as *mut c_void,
234 &mut k_v as *mut _ as *mut c_void,
235 ];
236
237 for _ in 0..5 {
239 unsafe {
240 stream
241 .launch_kernel(
242 &mut module,
243 "gemm_cta64x128_mma_pipeline_fp16",
244 &cfg,
245 &mut args,
246 )
247 .ok()?;
248 }
249 }
250 stream.synchronize().ok()?;
251
252 let iters: u32 = if n <= 512 {
253 100
254 } else if n <= 1024 {
255 50
256 } else {
257 20
258 };
259 let start = std::time::Instant::now();
260 for _ in 0..iters {
261 unsafe {
262 stream
263 .launch_kernel(
264 &mut module,
265 "gemm_cta64x128_mma_pipeline_fp16",
266 &cfg,
267 &mut args,
268 )
269 .ok()?;
270 }
271 }
272 stream.synchronize().ok()?;
273 let per_call_us = start.elapsed().as_micros() as f64 / iters as f64;
274 let flops = 2.0 * (n as f64).powi(3);
275 let tflops = flops / (per_call_us * 1e6);
276
277 Some((per_call_us, tflops))
278}
279
280pub fn run_compare(kernel: &str, size: u32, backends_str: &str, json: bool) -> Result<()> {
282 let backends: Vec<&str> = backends_str.split(',').map(|s| s.trim()).collect();
283
284 if !json {
285 println!("\n=== CGP Cross-Backend Comparison: {kernel} ({size}x{size}x{size}) ===\n");
286 }
287
288 let actual = get_actual_gemm_timing(size);
289 let mut results = collect_backend_results(&backends, size, actual);
290 results.sort_by(|a, b| {
291 a.wall_time_us
292 .partial_cmp(&b.wall_time_us)
293 .unwrap_or(std::cmp::Ordering::Equal)
294 });
295
296 if json {
297 println!("{}", serde_json::to_string_pretty(&results)?);
298 return Ok(());
299 }
300
301 render_comparison_table(&results);
302 render_source_legend(&results);
303 render_best_summary(&results);
304 render_cpu_gpu_gap(&results);
305
306 println!();
307 Ok(())
308}
309
310fn collect_backend_results(
312 backends: &[&str],
313 size: u32,
314 actual: Option<(f64, f64)>,
315) -> Vec<BackendResult> {
316 let mut results: Vec<BackendResult> = Vec::new();
317 for backend in backends {
318 let Some((time_us, available, measured)) = measure_backend(backend, size, actual) else {
319 continue;
320 };
321 results.push(BackendResult {
322 name: (*backend).to_string(),
323 wall_time_us: time_us,
324 tflops: gemm_tflops(size, time_us),
325 bandwidth_gbps: 0.0,
326 available,
327 measured,
328 });
329 }
330 results
331}
332
333fn measure_backend(
335 backend: &str,
336 size: u32,
337 actual: Option<(f64, f64)>,
338) -> Option<(f64, bool, bool)> {
339 match backend {
340 "scalar" => Some((estimate_scalar_time_us(size), true, false)),
341 "avx2" => Some(measure_avx_backend(size, actual, false)),
342 "avx512" => Some(measure_avx_backend(size, actual, true)),
343 "neon" => Some((
344 estimate_scalar_time_us(size) / 4.0,
345 cfg!(target_arch = "aarch64"),
346 false,
347 )),
348 "cuda" => Some(measure_cuda_backend(size)),
349 "cublas" => Some(measure_cublas_backend(size)),
350 "wgpu" => Some((
351 estimate_cuda_time_us(size) * 2.0,
352 which::which("nvidia-smi").is_ok(),
353 false,
354 )),
355 other => {
356 eprintln!(" Warning: unknown backend '{other}', skipping");
357 None
358 }
359 }
360}
361
362fn measure_avx_backend(size: u32, actual: Option<(f64, f64)>, avx512: bool) -> (f64, bool, bool) {
364 #[cfg(target_arch = "x86_64")]
365 let avail = if avx512 {
366 std::arch::is_x86_feature_detected!("avx512f")
367 } else {
368 std::arch::is_x86_feature_detected!("avx2")
369 };
370 #[cfg(not(target_arch = "x86_64"))]
371 let avail = false;
372
373 if let Some((actual_us, _)) = actual {
374 return (actual_us, avail, true);
375 }
376 if avx512 {
377 (estimate_avx512_time_us(size), avail, false)
378 } else {
379 (estimate_avx2_time_us(size), avail, false)
380 }
381}
382
383fn measure_cuda_backend(size: u32) -> (f64, bool, bool) {
385 let avail = which::which("nvidia-smi").is_ok();
386 if avail {
387 if let Some((time_us, _)) = try_measure_ptx(size) {
388 return (time_us, true, true);
389 }
390 }
391 (estimate_cuda_time_us(size), avail, false)
392}
393
394fn measure_cublas_backend(size: u32) -> (f64, bool, bool) {
396 let avail = which::which("nvidia-smi").is_ok();
397 if avail {
398 if let Some((time_us, _)) = try_measure_cublas(size) {
399 return (time_us, true, true);
400 }
401 }
402 (estimate_cublas_time_us(size), avail, false)
403}
404
405#[cfg(feature = "cuda")]
407fn try_measure_ptx(size: u32) -> Option<(f64, f64)> {
408 measure_ptx_gemm(size)
409}
410
411#[cfg(not(feature = "cuda"))]
412fn try_measure_ptx(_size: u32) -> Option<(f64, f64)> {
413 None
414}
415
416#[cfg(feature = "cuda")]
418fn try_measure_cublas(size: u32) -> Option<(f64, f64)> {
419 measure_cublas_gemm(size)
420}
421
422#[cfg(not(feature = "cuda"))]
423fn try_measure_cublas(_size: u32) -> Option<(f64, f64)> {
424 None
425}
426
427fn render_comparison_table(results: &[BackendResult]) {
429 let best_time = results.first().map_or(1.0, |r| r.wall_time_us);
430 println!(
431 " {:12} {:>12} {:>12} {:>10} {:>10} {:>8} {:>5}",
432 "Backend", "Time (us)", "TFLOP/s", "Efficiency", "vs Best", "Avail", "Src"
433 );
434 println!(" {}", "-".repeat(75));
435 let (cpu_peak, gpu_peak) = peak_performance_limits();
436 for r in results {
437 render_comparison_row(r, best_time, cpu_peak, gpu_peak);
438 }
439}
440
441fn render_comparison_row(r: &BackendResult, best_time: f64, cpu_peak: f64, gpu_peak: f64) {
443 let peak_tflops = if r.name.contains("cuda") || r.name.contains("cublas") || r.name == "wgpu" {
444 gpu_peak / 1e12
445 } else {
446 cpu_peak / 1e12
447 };
448 let efficiency = if peak_tflops > 0.0 {
449 r.tflops / peak_tflops * 100.0
450 } else {
451 0.0
452 };
453 let ratio = format!("{:.2}x", r.wall_time_us / best_time);
454 let avail = if r.available { "yes" } else { "no" };
455 let time_str = if r.wall_time_us >= 1000.0 {
456 format!("{:.1} ms", r.wall_time_us / 1000.0)
457 } else {
458 format!("{:.1}", r.wall_time_us)
459 };
460 let src = if r.measured { "M" } else { "E" };
461 println!(
462 " {:12} {:>12} {:>12.1} {:>9.1}% {:>10} {:>8} {:>5}",
463 r.name, time_str, r.tflops, efficiency, ratio, avail, src
464 );
465}
466
467fn peak_performance_limits() -> (f64, f64) {
469 let model = RooflineModel::rtx_4090();
470 let gpu_peak = model
471 .peak_compute
472 .get(&Precision::Fp16)
473 .copied()
474 .unwrap_or(330.0e12);
475 let cores = num_cpus::get_physical();
476 #[allow(clippy::cast_precision_loss)]
477 let cpu_peak = 2.0 * 8.0 * 2.0 * 3.5e9 * cores as f64; (cpu_peak, gpu_peak)
479}
480
481fn render_source_legend(results: &[BackendResult]) {
483 let has_measured = results.iter().any(|r| r.measured);
484 let has_estimated = results.iter().any(|r| !r.measured);
485 if !(has_measured || has_estimated) {
486 return;
487 }
488 print!(" Src: ");
489 if has_measured {
490 print!("M=measured ");
491 }
492 if has_estimated {
493 print!("E=estimated ");
494 }
495 println!();
496}
497
498fn render_best_summary(results: &[BackendResult]) {
500 let Some(best) = results.first() else {
501 return;
502 };
503 let Some(worst) = results.last() else {
504 return;
505 };
506 let speedup = worst.wall_time_us / best.wall_time_us;
507 println!(
508 "\n Best: {} ({:.1}x faster than {})",
509 best.name, speedup, worst.name
510 );
511}
512
513fn render_cpu_gpu_gap(results: &[BackendResult]) {
515 let has_cpu = results
516 .iter()
517 .any(|r| matches!(r.name.as_str(), "scalar" | "avx2" | "avx512"));
518 let has_gpu = results
519 .iter()
520 .any(|r| matches!(r.name.as_str(), "cuda" | "cublas" | "wgpu"));
521 if !(has_cpu && has_gpu) {
522 return;
523 }
524 let best_cpu = results
525 .iter()
526 .filter(|r| matches!(r.name.as_str(), "scalar" | "avx2" | "avx512"))
527 .map(|r| r.wall_time_us)
528 .fold(f64::INFINITY, f64::min);
529 let best_gpu = results
530 .iter()
531 .filter(|r| matches!(r.name.as_str(), "cuda" | "cublas" | "wgpu"))
532 .map(|r| r.wall_time_us)
533 .fold(f64::INFINITY, f64::min);
534 if best_gpu > 0.0 {
535 println!(
536 " CPU→GPU gap: {:.0}x (expected for large GEMM)",
537 best_cpu / best_gpu
538 );
539 }
540}
541
542#[cfg(test)]
543mod tests {
544 use super::*;
545
546 #[test]
547 fn test_gemm_tflops() {
548 let tflops = gemm_tflops(512, 23.2);
550 assert!(
551 (tflops - 11.56).abs() < 0.1,
552 "Expected ~11.6 TFLOP/s, got {tflops:.2}"
553 );
554 }
555
556 #[test]
557 fn test_scalar_slower_than_avx2() {
558 let scalar = estimate_scalar_time_us(512);
559 let avx2 = estimate_avx2_time_us(512);
560 assert!(scalar > avx2 * 3.0, "Scalar should be >3x slower than AVX2");
561 }
562
563 #[test]
564 fn test_cuda_faster_than_cpu() {
565 let cpu = estimate_avx2_time_us(4096);
566 let cuda = estimate_cuda_time_us(4096);
567 assert!(
568 cpu > cuda * 10.0,
569 "CPU should be >10x slower than CUDA for 4096"
570 );
571 }
572
573 #[test]
575 fn test_cuda_faster_than_scalar_at_256() {
576 let scalar = estimate_scalar_time_us(256);
577 let cuda = estimate_cuda_time_us(256);
578 assert!(cuda < scalar, "CUDA should be faster than scalar at 256");
579 }
580
581 #[test]
583 fn test_simd_faster_than_scalar() {
584 let scalar = estimate_scalar_time_us(1024);
585 let avx2 = estimate_avx2_time_us(1024);
586 assert!(
587 scalar / avx2 >= 3.0,
588 "AVX2 speedup {:.1}x should be >= 3x",
589 scalar / avx2
590 );
591 }
592
593 #[test]
595 fn test_cublas_faster_than_ptx() {
596 let ptx = estimate_cuda_time_us(4096);
597 let cublas = estimate_cublas_time_us(4096);
598 assert!(cublas < ptx, "cuBLAS should be faster than PTX at 4096");
599 }
600
601 #[test]
602 fn test_run_compare_basic() {
603 let result = run_compare("gemm", 256, "scalar,avx2", false);
604 assert!(result.is_ok());
605 }
606
607 #[test]
608 fn test_run_compare_json() {
609 let result = run_compare("gemm", 256, "scalar,avx2", true);
610 assert!(result.is_ok());
611 }
612
613 #[test]
615 fn test_get_actual_gemm_timing() {
616 if let Some((time_us, gflops)) = get_actual_gemm_timing(1024) {
617 assert!(time_us > 0.0, "time should be positive");
618 assert!(gflops > 10.0, "GFLOPS should be > 10 for 1024 GEMM");
619 assert!(gflops < 2000.0, "GFLOPS should be < 2000");
620 eprintln!(
621 "Actual GEMM 1024: {:.1} us = {:.0} GFLOPS [MEASURED]",
622 time_us, gflops
623 );
624 } else {
625 eprintln!("benchmark_matrix_suite binary not found — actual data unavailable");
626 }
627 }
628}