use std::time::Instant;
use trueno_gpu::kernels::{
AttentionKernel, GemmKernel, Kernel, LayerNormKernel, QuantizeKernel, SoftmaxKernel,
};
/// Measures how long it takes to generate PTX text for a fixed set of kernels.
///
/// For every entry in the table the generator closure is invoked `iterations`
/// times under a wall-clock timer, and the mean time per call is printed in
/// microseconds together with the byte length and line count of one generated
/// string. A final section runs a single tiled GEMM configuration in a longer
/// loop and reports the result as kernels generated per second.
fn main() {
    use std::hint::black_box;

    let iterations = 1000;
    let heavy_iterations = 10000;

    println!("Kernel Generation Performance ({iterations} iterations each)");
    println!("═══════════════════════════════════════════════════════════════");

    // One untimed generation before any measurement starts; the output is discarded.
    let _ = GemmKernel::naive(32, 32, 32).emit_ptx();

    // (label, generator) pairs; the closures are boxed so that differently
    // typed closures can live in the same vector.
    let tests: Vec<(&str, Box<dyn Fn() -> String>)> = vec![
        (
            "gemm_naive_64",
            Box::new(|| GemmKernel::naive(64, 64, 64).emit_ptx()),
        ),
        (
            "gemm_tiled_128",
            Box::new(|| GemmKernel::tiled(128, 128, 128, 32).emit_ptx()),
        ),
        (
            "gemm_tensor_core",
            Box::new(|| GemmKernel::tensor_core(64, 64, 64).emit_ptx()),
        ),
        (
            "gemm_wmma_fp16",
            Box::new(|| GemmKernel::wmma_fp16(64, 64, 64).emit_ptx()),
        ),
        (
            "softmax_1024",
            Box::new(|| SoftmaxKernel::new(1024).emit_ptx()),
        ),
        (
            "layernorm_1024",
            Box::new(|| LayerNormKernel::new(1024).emit_ptx()),
        ),
        (
            "attention_64_64",
            Box::new(|| AttentionKernel::new(64, 64).emit_ptx()),
        ),
        (
            "q4k_32",
            Box::new(|| QuantizeKernel::ggml(32, 32, 256).emit_ptx()),
        ),
    ];

    for (label, generate) in tests.iter() {
        let timer = Instant::now();
        for _ in 0..iterations {
            // black_box keeps the optimizer from discarding the unused result.
            black_box(generate());
        }
        let took = timer.elapsed();

        let nanos_each = took.as_nanos() as f64 / iterations as f64;
        let micros_each = nanos_each / 1000.0;

        // One extra, untimed call purely to report the size of the output.
        let sample = generate();
        let byte_len = sample.len();
        let line_count = sample.lines().count();

        println!(
            "{:20} {:8.2} us ({} bytes, {} lines)",
            label, micros_each, byte_len, line_count
        );
    }

    println!("\n═══════════════════════════════════════════════════════════════");

    // Throughput run: the same kernel configuration generated back to back.
    let timer = Instant::now();
    for _ in 0..heavy_iterations {
        black_box(GemmKernel::tiled(64, 64, 64, 16).emit_ptx());
    }
    let took = timer.elapsed();

    let kernels_per_sec = heavy_iterations as f64 / took.as_secs_f64();
    println!("Throughput: {:.0} kernels/sec (gemm_tiled_64)", kernels_per_sec);
}