//! Cross-backend GEMM comparison: measures real CUDA/cuBLAS timings when
//! available and falls back to calibrated estimates for everything else.
use crate::analysis::roofline::{Precision, RooflineModel};
use anyhow::Result;
use serde::Serialize;

/// Result of benchmarking one backend for the comparison table.
#[derive(Debug, Clone, Serialize)]
pub struct BackendResult {
    pub name: String,
    pub wall_time_us: f64,
    pub tflops: f64,
    pub bandwidth_gbps: f64,
    pub available: bool,
    /// Only serialized when `true`; estimated entries omit the flag entirely.
    #[serde(skip_serializing_if = "std::ops::Not::not")]
    pub measured: bool,
}

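/// TFLOP/s for an `n x n x n` GEMM (`2·n³` flops) completed in `time_us` microseconds.
///
/// Example: `gemm_tflops(512, 23.2)` ≈ 11.57 TFLOP/s.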
fn gemm_tflops(size: u32, time_us: f64) -> f64 {
    if time_us <= 0.0 {
        return 0.0;
    }
    let flops = 2.0 * (size as f64).powi(3);
    flops / (time_us * 1e-6) / 1e12
}

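/// Run the `benchmark_matrix_suite` example binary, if one can be found, and
/// parse the measured timing for the requested size. Returns
/// `(wall_time_us, gflops)`, or `None` when the binary is missing, fails, or
/// prints no matching line.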
fn get_actual_gemm_timing(size: u32) -> Option<(f64, f64)> {
    let candidates = [
        "/mnt/nvme-raid0/targets/trueno/release/examples/benchmark_matrix_suite",
        "./target/release/examples/benchmark_matrix_suite",
    ];
    let binary_path = candidates
        .iter()
        .find(|p| std::path::Path::new(p).exists())?;
    let output = std::process::Command::new(*binary_path)
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    let stdout = String::from_utf8_lossy(&output.stdout);
    let pattern = format!("Matrix Multiplication ({}x{}x{})", size, size, size);

    for line in stdout.lines() {
        if line.contains(&pattern) {
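            // Expected shape (assumed from this parser):
            //   "Matrix Multiplication (NxNxN) ... <time> ms (<rate> GFLOPS)"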
            let after_dots = line.split("...").nth(1)?;
            let time_ms = after_dots.split("ms").next()?.trim().parse::<f64>().ok()?;
            let gflops = after_dots
                .split('(')
                .nth(1)?
                .split(" GFLOPS")
                .next()?
                .trim()
                .parse::<f64>()
                .ok()?;
            return Some((time_ms * 1000.0, gflops));
        }
    }
    None
}

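/// Estimated scalar GEMM time, scaled cubically from a ~11.7 ms calibration
/// point at 256x256x256 (a machine-specific constant; treat as a rough guess).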
fn estimate_scalar_time_us(size: u32) -> f64 {
    let ratio = (size as f64 / 256.0).powi(3);
    11_700.0 * ratio
}

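/// Estimated AVX2 GEMM time, derived from an assumed ~72 GFLOPS sustained rate.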
fn estimate_avx2_time_us(size: u32) -> f64 {
    let flops = 2.0 * (size as f64).powi(3);
    let gflops = 72.0;
    flops / (gflops * 1e9) * 1e6
}

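/// Estimated AVX-512 GEMM time, derived from an assumed ~80 GFLOPS sustained rate.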
fn estimate_avx512_time_us(size: u32) -> f64 {
    let flops = 2.0 * (size as f64).powi(3);
    let gflops = 80.0;
    flops / (gflops * 1e9) * 1e6
}

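/// Estimated CUDA (custom PTX kernel) GEMM time, scaled cubically from a
/// ~23.2 us calibration point at 512x512x512.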
fn estimate_cuda_time_us(size: u32) -> f64 {
    let ratio = (size as f64 / 512.0).powi(3);
    23.2 * ratio
}

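/// Estimated cuBLAS GEMM time, assumed ~3x faster than the custom PTX kernel.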
fn estimate_cublas_time_us(size: u32) -> f64 {
    estimate_cuda_time_us(size) / 3.0
}

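/// Measure FP16 GEMM through cuBLAS: warm up, then time a batch of calls and
/// average. Returns `(wall_time_us_per_call, tflops)`, or `None` on any CUDA error.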
#[cfg(feature = "cuda")]
fn measure_cublas_gemm(size: u32) -> Option<(f64, f64)> {
    use trueno_gpu::driver::{CublasHandle, CudaContext, CudaStream, GemmOp, GpuBuffer};

    let ctx = CudaContext::new(0).ok()?;
    let stream = CudaStream::new(&ctx).ok()?;
    let handle = CublasHandle::new(&ctx).ok()?;
    handle.set_stream(&stream).ok()?;

    let n = size as usize;
    // 0x3C00 is 1.0 in IEEE 754 half precision.
    let a_data = vec![0x3C00u16; n * n];
    let b_data = vec![0x3C00u16; n * n];
    let c_data = vec![0u16; n * n];

    let a_buf = GpuBuffer::from_host(&ctx, &a_data).ok()?;
    let b_buf = GpuBuffer::from_host(&ctx, &b_data).ok()?;
    let c_buf = GpuBuffer::from_host(&ctx, &c_data).ok()?;

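    // Warm-up calls, excluded from timing.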
    for _ in 0..5 {
        let _ = handle.gemm_f16(
            GemmOp::NoTrans,
            GemmOp::NoTrans,
            n as i32,
            n as i32,
            n as i32,
            1.0,
            a_buf.as_ptr(),
            n as i32,
            b_buf.as_ptr(),
            n as i32,
            0.0,
            c_buf.as_ptr(),
            n as i32,
        );
    }
    stream.synchronize().ok()?;

    // Fewer timed iterations for larger problems to bound total runtime.
    let iters: u32 = if n <= 512 {
        200
    } else if n <= 1024 {
        100
    } else {
        30
    };
    let start = std::time::Instant::now();
    for _ in 0..iters {
        let _ = handle.gemm_f16(
            GemmOp::NoTrans,
            GemmOp::NoTrans,
            n as i32,
            n as i32,
            n as i32,
            1.0,
            a_buf.as_ptr(),
            n as i32,
            b_buf.as_ptr(),
            n as i32,
            0.0,
            c_buf.as_ptr(),
            n as i32,
        );
    }
    stream.synchronize().ok()?;
    let elapsed = start.elapsed();

    let per_call_us = elapsed.as_micros() as f64 / iters as f64;
    let flops = 2.0 * (n as f64).powi(3);
    // flops / (us * 1e-6) / 1e12 simplifies to flops / (us * 1e6).
    let tflops = flops / (per_call_us * 1e6);

    Some((per_call_us, tflops))
}

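/// Measure FP16 GEMM through the hand-built PTX pipeline kernel: build and load
/// the module, warm up, then time a batch of launches and average. Returns
/// `(wall_time_us_per_call, tflops)`, or `None` on any CUDA error.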
#[cfg(feature = "cuda")]
fn measure_ptx_gemm(size: u32) -> Option<(f64, f64)> {
    use std::ffi::c_void;
    use trueno_gpu::driver::{CudaContext, CudaModule, CudaStream, GpuBuffer, LaunchConfig};
    use trueno_gpu::kernels::build_cta64x128_mma_pipeline_fp16;
    use trueno_gpu::ptx::PtxModule;

    let ctx = CudaContext::new(0).ok()?;
    let stream = CudaStream::new(&ctx).ok()?;

    let n = size as usize;
    let a16 = vec![0x3C00u16; n * n];
    let b16 = vec![0x3C00u16; n * n];
    let c32 = vec![0.0f32; n * n];

    let a_buf = GpuBuffer::from_host(&ctx, &a16).ok()?;
    let b_buf = GpuBuffer::from_host(&ctx, &b16).ok()?;
    let c_buf = GpuBuffer::from_host(&ctx, &c32).ok()?;

    let kernel = build_cta64x128_mma_pipeline_fp16(n as u32, n as u32, n as u32);
    let ptx = PtxModule::new().target("sm_80").add_kernel(kernel).emit();
    let mut module = CudaModule::from_ptx(&ctx, &ptx).ok()?;

    let cfg = LaunchConfig {
        grid: (((n + 127) / 128) as u32, ((n + 63) / 64) as u32, 1),
        block: (512, 1, 1),
        shared_mem: 18432,
    };

    // Kernel arguments are passed as raw pointers; these locals must stay
    // alive until the final synchronize.
    let mut a_ptr = a_buf.as_ptr();
    let mut b_ptr = b_buf.as_ptr();
    let mut c_ptr = c_buf.as_ptr();
    let mut m_v = n as u32;
    let mut n_v = n as u32;
    let mut k_v = n as u32;
    let mut args: Vec<*mut c_void> = vec![
        &mut a_ptr as *mut _ as *mut c_void,
        &mut b_ptr as *mut _ as *mut c_void,
        &mut c_ptr as *mut _ as *mut c_void,
        &mut m_v as *mut _ as *mut c_void,
        &mut n_v as *mut _ as *mut c_void,
        &mut k_v as *mut _ as *mut c_void,
    ];

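    // Warm-up launches, excluded from timing.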
    for _ in 0..5 {
        unsafe {
            stream
                .launch_kernel(
                    &mut module,
                    "gemm_cta64x128_mma_pipeline_fp16",
                    &cfg,
                    &mut args,
                )
                .ok()?;
        }
    }
    stream.synchronize().ok()?;

    let iters: u32 = if n <= 512 {
        100
    } else if n <= 1024 {
        50
    } else {
        20
    };
    let start = std::time::Instant::now();
    for _ in 0..iters {
        unsafe {
            stream
                .launch_kernel(
                    &mut module,
                    "gemm_cta64x128_mma_pipeline_fp16",
                    &cfg,
                    &mut args,
                )
                .ok()?;
        }
    }
    stream.synchronize().ok()?;
    let per_call_us = start.elapsed().as_micros() as f64 / iters as f64;
    let flops = 2.0 * (n as f64).powi(3);
    let tflops = flops / (per_call_us * 1e6);

    Some((per_call_us, tflops))
}

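/// Compare GEMM performance across the requested backends, printing either a
/// human-readable table or JSON. Backends with real measurements are marked
/// `M`; the rest fall back to the estimate functions above and are marked `E`.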
pub fn run_compare(kernel: &str, size: u32, backends_str: &str, json: bool) -> Result<()> {
    let backends: Vec<&str> = backends_str.split(',').map(|s| s.trim()).collect();

    if !json {
        println!("\n=== CGP Cross-Backend Comparison: {kernel} ({size}x{size}x{size}) ===\n");
    }

    let mut results: Vec<BackendResult> = Vec::new();

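    // One attempt at real measured CPU timing, shared by the SIMD arms below.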
    let actual = get_actual_gemm_timing(size);

    for backend in &backends {
        let (time_us, available, measured) = match *backend {
            "scalar" => (estimate_scalar_time_us(size), true, false),
            "avx2" | "avx512" => {
                #[cfg(target_arch = "x86_64")]
                let avail = if *backend == "avx512" {
                    std::arch::is_x86_feature_detected!("avx512f")
                } else {
                    std::arch::is_x86_feature_detected!("avx2")
                };
                #[cfg(not(target_arch = "x86_64"))]
                let avail = false;

                if let Some((actual_us, _)) = actual {
                    (actual_us, avail, true)
                } else if *backend == "avx512" {
                    (estimate_avx512_time_us(size), avail, false)
                } else {
                    (estimate_avx2_time_us(size), avail, false)
                }
            }
            "neon" => {
                let avail = cfg!(target_arch = "aarch64");
                // Assumed ~4x over scalar; no NEON measurement path yet.
                (estimate_scalar_time_us(size) / 4.0, avail, false)
            }
            "cuda" => {
                let avail = which::which("nvidia-smi").is_ok();
                #[cfg(feature = "cuda")]
                if avail {
                    if let Some((time_us, _tflops)) = measure_ptx_gemm(size) {
                        (time_us, true, true)
                    } else {
                        (estimate_cuda_time_us(size), avail, false)
                    }
                } else {
                    (estimate_cuda_time_us(size), avail, false)
                }
                #[cfg(not(feature = "cuda"))]
                {
                    (estimate_cuda_time_us(size), avail, false)
                }
            }
            "cublas" => {
                let avail = which::which("nvidia-smi").is_ok();
                #[cfg(feature = "cuda")]
                if avail {
                    if let Some((time_us, _tflops)) = measure_cublas_gemm(size) {
                        (time_us, true, true)
                    } else {
                        (estimate_cublas_time_us(size), avail, false)
                    }
                } else {
                    (estimate_cublas_time_us(size), avail, false)
                }
                #[cfg(not(feature = "cuda"))]
                {
                    (estimate_cublas_time_us(size), avail, false)
                }
            }
            "wgpu" => {
                let avail = which::which("nvidia-smi").is_ok();
                // Assumed ~2x slower than the CUDA path; no wgpu measurement yet.
                (estimate_cuda_time_us(size) * 2.0, avail, false)
            }
            other => {
                eprintln!(" Warning: unknown backend '{other}', skipping");
                continue;
            }
        };

        let tflops = gemm_tflops(size, time_us);

        results.push(BackendResult {
            name: backend.to_string(),
            wall_time_us: time_us,
            tflops,
            bandwidth_gbps: 0.0,
            available,
            measured,
        });
    }

    results.sort_by(|a, b| {
        a.wall_time_us
            .partial_cmp(&b.wall_time_us)
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    if json {
        println!("{}", serde_json::to_string_pretty(&results)?);
        return Ok(());
    }

    let best_time = results.first().map(|r| r.wall_time_us).unwrap_or(1.0);

    println!(
        " {:12} {:>12} {:>12} {:>10} {:>10} {:>8} {:>5}",
        "Backend", "Time (us)", "TFLOP/s", "Efficiency", "vs Best", "Avail", "Src"
    );
    println!(" {}", "-".repeat(75));

    let model = RooflineModel::rtx_4090();
    let gpu_peak = model
        .peak_compute
        .get(&Precision::Fp16)
        .copied()
        .unwrap_or(330.0e12);
    let cores = num_cpus::get_physical();
    // Rough CPU peak: 2 FMA units x 8 f32 lanes x 2 flops/FMA x ~3.5 GHz
    // (assumed clock) x physical cores.
    let cpu_peak = 2.0 * 8.0 * 2.0 * 3.5e9 * cores as f64;

    for r in &results {
        let peak = if r.name.contains("cuda") || r.name.contains("cublas") || r.name == "wgpu" {
            gpu_peak / 1e12
        } else {
            cpu_peak / 1e12
        };
        let efficiency = if peak > 0.0 {
            r.tflops / peak * 100.0
        } else {
            0.0
        };
        let ratio = format!("{:.2}x", r.wall_time_us / best_time);
        let avail = if r.available { "yes" } else { "no" };

        let time_str = if r.wall_time_us >= 1000.0 {
            format!("{:.1} ms", r.wall_time_us / 1000.0)
        } else {
            format!("{:.1}", r.wall_time_us)
        };

        let src = if r.measured { "M" } else { "E" };
        println!(
            " {:12} {:>12} {:>12.1} {:>9.1}% {:>10} {:>8} {:>5}",
            r.name, time_str, r.tflops, efficiency, ratio, avail, src
        );
    }

    let has_measured = results.iter().any(|r| r.measured);
    let has_estimated = results.iter().any(|r| !r.measured);
    if has_measured || has_estimated {
        print!(" Src: ");
        if has_measured {
            print!("M=measured ");
        }
        if has_estimated {
            print!("E=estimated ");
        }
        println!();
    }

    if let Some(best) = results.first() {
        if let Some(worst) = results.last() {
            let speedup = worst.wall_time_us / best.wall_time_us;
            println!(
                "\n Best: {} ({:.1}x faster than {})",
                best.name, speedup, worst.name
            );
        }
    }

    let has_cpu = results
        .iter()
        .any(|r| matches!(r.name.as_str(), "scalar" | "avx2" | "avx512"));
    let has_gpu = results
        .iter()
        .any(|r| matches!(r.name.as_str(), "cuda" | "cublas" | "wgpu"));
    if has_cpu && has_gpu {
        let best_cpu = results
            .iter()
            .filter(|r| matches!(r.name.as_str(), "scalar" | "avx2" | "avx512"))
            .map(|r| r.wall_time_us)
            .fold(f64::INFINITY, f64::min);
        let best_gpu = results
            .iter()
            .filter(|r| matches!(r.name.as_str(), "cuda" | "cublas" | "wgpu"))
            .map(|r| r.wall_time_us)
            .fold(f64::INFINITY, f64::min);
        if best_gpu > 0.0 {
            println!(
                " CPU→GPU gap: {:.0}x (expected for large GEMM)",
                best_cpu / best_gpu
            );
        }
    }

    println!();
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_gemm_tflops() {
        // 2 * 512^3 flops in 23.2 us ≈ 11.57 TFLOP/s.
        let tflops = gemm_tflops(512, 23.2);
        assert!(
            (tflops - 11.56).abs() < 0.1,
            "Expected ~11.6 TFLOP/s, got {tflops:.2}"
        );
    }

    #[test]
    fn test_scalar_slower_than_avx2() {
        let scalar = estimate_scalar_time_us(512);
        let avx2 = estimate_avx2_time_us(512);
        assert!(scalar > avx2 * 3.0, "Scalar should be >3x slower than AVX2");
    }

    #[test]
    fn test_cuda_faster_than_cpu() {
        let cpu = estimate_avx2_time_us(4096);
        let cuda = estimate_cuda_time_us(4096);
        assert!(
            cpu > cuda * 10.0,
            "CPU should be >10x slower than CUDA for 4096"
        );
    }

    #[test]
    fn test_cuda_faster_than_scalar_at_256() {
        let scalar = estimate_scalar_time_us(256);
        let cuda = estimate_cuda_time_us(256);
        assert!(cuda < scalar, "CUDA should be faster than scalar at 256");
    }

    #[test]
    fn test_simd_faster_than_scalar() {
        let scalar = estimate_scalar_time_us(1024);
        let avx2 = estimate_avx2_time_us(1024);
        assert!(
            scalar / avx2 >= 3.0,
            "AVX2 speedup {:.1}x should be >= 3x",
            scalar / avx2
        );
    }

    #[test]
    fn test_cublas_faster_than_ptx() {
        let ptx = estimate_cuda_time_us(4096);
        let cublas = estimate_cublas_time_us(4096);
        assert!(cublas < ptx, "cuBLAS should be faster than PTX at 4096");
    }

    #[test]
    fn test_run_compare_basic() {
        let result = run_compare("gemm", 256, "scalar,avx2", false);
        assert!(result.is_ok());
    }

    #[test]
    fn test_run_compare_json() {
        let result = run_compare("gemm", 256, "scalar,avx2", true);
        assert!(result.is_ok());
    }

    #[test]
    fn test_get_actual_gemm_timing() {
        if let Some((time_us, gflops)) = get_actual_gemm_timing(1024) {
            assert!(time_us > 0.0, "time should be positive");
            assert!(gflops > 10.0, "GFLOPS should be > 10 for 1024 GEMM");
            assert!(gflops < 2000.0, "GFLOPS should be < 2000");
            eprintln!(
                "Actual GEMM 1024: {:.1} us = {:.0} GFLOPS [MEASURED]",
                time_us, gflops
            );
        } else {
            eprintln!("benchmark_matrix_suite binary not found; actual data unavailable");
        }
    }
}