#![allow(dead_code)]
#![allow(clippy::too_many_arguments)]
use std::time::{Duration, Instant};
use crate::lbm_gpu::{LbmConfig, LbmSimulation};
use crate::sph_gpu::{SphConfig, SphSimulation};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BackendKind {
Cpu,
Wgpu,
Cuda,
}
impl std::fmt::Display for BackendKind {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Cpu => write!(f, "CPU"),
Self::Wgpu => write!(f, "wgpu"),
Self::Cuda => write!(f, "CUDA"),
}
}
}
#[derive(Debug, Clone)]
pub struct GpuBenchReport {
pub name: String,
pub backend: BackendKind,
pub n: usize,
pub iterations: u32,
pub total: Duration,
pub mean: Duration,
pub mflops: Option<f64>,
}
impl std::fmt::Display for GpuBenchReport {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"[{:<5} {:>20}] n={:>6} mean={:.3}µs",
self.backend,
self.name,
self.n,
self.mean.as_secs_f64() * 1e6
)?;
if let Some(mf) = self.mflops {
write!(f, " {:.1} MFLOPs", mf)?;
}
Ok(())
}
}
pub struct GpuBenchHarness {
pub warmup: u32,
pub iterations: u32,
pub reports: Vec<GpuBenchReport>,
}
impl GpuBenchHarness {
pub fn new() -> Self {
Self {
warmup: 2,
iterations: 5,
reports: Vec::new(),
}
}
pub fn available_backends() -> Vec<BackendKind> {
let mut out = vec![BackendKind::Cpu];
if crate::compute::WgpuBackend::try_new().is_ok() {
out.push(BackendKind::Wgpu);
}
if crate::compute::cuda_backend::CudaBackend::try_new(0).is_ok() {
out.push(BackendKind::Cuda);
}
out
}
pub fn bench_sph_density(&mut self, n: usize) -> Vec<GpuBenchReport> {
let cfg = SphConfig {
n_particles: n,
smoothing_h: 0.1,
rest_density: 1000.0,
gravity: 0.0, domain_min: [-10.; 3],
domain_max: [10.; 3],
..SphConfig::default()
};
let mut out = Vec::new();
{
let mut sim = SphSimulation::new(cfg.clone());
let side = (n as f64).cbrt().ceil() as usize + 1;
for (idx, i) in (0..n).enumerate() {
let x = (idx % side) as f64 * 0.1 - 5.0;
let y = ((idx / side) % side) as f64 * 0.1;
let z = (idx / (side * side)) as f64 * 0.1;
sim.state.pos_x[i] = x;
sim.state.pos_y[i] = y;
sim.state.pos_z[i] = z;
}
for _ in 0..self.warmup {
sim.step(1.0 / 60.0);
}
let t0 = Instant::now();
for _ in 0..self.iterations {
sim.step(1.0 / 60.0);
}
let total = t0.elapsed();
let flops = 10.0 * n as f64 * n as f64;
let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
let r = GpuBenchReport {
name: "sph_density".to_string(),
backend: BackendKind::Cpu,
n,
iterations: self.iterations,
total,
mean: total / self.iterations,
mflops: Some(mflops),
};
out.push(r.clone());
self.reports.push(r);
}
if crate::compute::WgpuBackend::try_new().is_ok() {
let mut sim = SphSimulation::new(cfg.clone());
let side = (n as f64).cbrt().ceil() as usize + 1;
for (idx, i) in (0..n).enumerate() {
sim.state.pos_x[i] = (idx % side) as f64 * 0.1 - 5.0;
sim.state.pos_y[i] = ((idx / side) % side) as f64 * 0.1;
sim.state.pos_z[i] = (idx / (side * side)) as f64 * 0.1;
}
for _ in 0..self.warmup {
sim.step(1.0 / 60.0);
}
let t0 = Instant::now();
for _ in 0..self.iterations {
sim.step(1.0 / 60.0);
}
let total = t0.elapsed();
let backend = if sim.has_gpu() {
BackendKind::Wgpu
} else {
BackendKind::Cpu
};
let flops = 10.0 * n as f64 * n as f64;
let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
let r = GpuBenchReport {
name: "sph_density".to_string(),
backend,
n,
iterations: self.iterations,
total,
mean: total / self.iterations,
mflops: Some(mflops),
};
out.push(r.clone());
self.reports.push(r);
}
out
}
pub fn bench_lbm_step(&mut self, nx: usize, ny: usize, nz: usize) -> Vec<GpuBenchReport> {
let cfg = LbmConfig {
nx,
ny,
nz,
tau: 0.6,
rho0: 1.0,
force_x: 0.0,
force_y: 0.0,
force_z: 0.0,
};
let nc = nx * ny * nz;
let mut out = Vec::new();
{
let mut sim = LbmSimulation::new(cfg.clone());
sim.set_lid_velocity(0.1, 0.0, 0.0);
for _ in 0..self.warmup {
sim.step();
}
let t0 = Instant::now();
for _ in 0..self.iterations {
sim.step();
}
let total = t0.elapsed();
let flops = 120.0 * nc as f64;
let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
let r = GpuBenchReport {
name: format!("lbm_bgk_{}x{}x{}", nx, ny, nz),
backend: BackendKind::Cpu,
n: nc,
iterations: self.iterations,
total,
mean: total / self.iterations,
mflops: Some(mflops),
};
out.push(r.clone());
self.reports.push(r);
}
out
}
pub fn bench_parallel_scan(&mut self, n: usize) -> GpuBenchReport {
let data: Vec<f64> = (0..n).map(|i| i as f64 + 1.0).collect();
for _ in 0..self.warmup {
let _ = inclusive_scan_cpu(&data);
}
let t0 = Instant::now();
let mut result = Vec::new();
for _ in 0..self.iterations {
result = inclusive_scan_cpu(&data);
}
let total = t0.elapsed();
let _ = result;
let flops = 2.0 * n as f64;
let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
let r = GpuBenchReport {
name: "parallel_scan".to_string(),
backend: BackendKind::Cpu,
n,
iterations: self.iterations,
total,
mean: total / self.iterations,
mflops: Some(mflops),
};
self.reports.push(r.clone());
r
}
pub fn run_full_suite(&mut self) -> String {
self.bench_sph_density(64);
self.bench_sph_density(256);
self.bench_lbm_step(8, 8, 8);
self.bench_lbm_step(16, 16, 4);
self.bench_parallel_scan(1024);
self.bench_parallel_scan(65536);
let mut out = format!("{} benchmarks\n", self.reports.len());
for r in &self.reports {
out.push_str(&format!(" {}\n", r));
}
out
}
pub fn cpu_vs_wgpu_comparison(&mut self, n: usize) -> Vec<GpuBenchReport> {
let mut out = Vec::new();
let data: Vec<f64> = (0..n).map(|i| i as f64).collect();
for _ in 0..self.warmup {
let _ = inclusive_scan_cpu(&data);
}
let t0 = std::time::Instant::now();
for _ in 0..self.iterations {
let _ = inclusive_scan_cpu(&data);
}
let total_cpu = t0.elapsed();
let mean_cpu = total_cpu / self.iterations;
let flops = 2.0 * n as f64;
let mflops_cpu = flops / (total_cpu.as_secs_f64() / self.iterations as f64) / 1e6;
let cpu_report = GpuBenchReport {
name: "cpu_copy_scan".to_string(),
backend: BackendKind::Cpu,
n,
iterations: self.iterations,
total: total_cpu,
mean: mean_cpu,
mflops: Some(mflops_cpu),
};
out.push(cpu_report.clone());
self.reports.push(cpu_report);
#[cfg(feature = "wgpu-backend")]
{
use crate::compute::wgpu_backend::real::WgpuBackendReal;
let backend_result = WgpuBackendReal::try_new();
if let Ok(mut backend) = backend_result {
const COPY_WGSL: &str = r#"
@group(0) @binding(0) var<storage, read> in_buf: array<f32>;
@group(0) @binding(1) var<storage, read_write> out_buf: array<f32>;
@compute @workgroup_size(64)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
let i = gid.x;
if (i < arrayLength(&in_buf)) {
out_buf[i] = in_buf[i];
}
}
"#;
let in_buf = backend.create_buffer_f64(n);
let out_buf = backend.create_buffer_f64(n);
backend.write_buffer_f64(in_buf, &data);
let workgroups = WgpuBackendReal::dispatch_count_for(n, 64);
for _ in 0..self.warmup {
let _ = backend.dispatch_wgsl(
COPY_WGSL,
"main",
&[
(in_buf, wgpu::BufferBindingType::Storage { read_only: true }),
(
out_buf,
wgpu::BufferBindingType::Storage { read_only: false },
),
],
workgroups,
);
}
let t0 = std::time::Instant::now();
for _ in 0..self.iterations {
let _ = backend.dispatch_wgsl(
COPY_WGSL,
"main",
&[
(in_buf, wgpu::BufferBindingType::Storage { read_only: true }),
(
out_buf,
wgpu::BufferBindingType::Storage { read_only: false },
),
],
workgroups,
);
}
let total_wgpu = t0.elapsed();
let mean_wgpu = total_wgpu / self.iterations;
let mflops_wgpu = flops / (total_wgpu.as_secs_f64() / self.iterations as f64) / 1e6;
let wgpu_report = GpuBenchReport {
name: "wgpu_copy_dispatch".to_string(),
backend: BackendKind::Wgpu,
n,
iterations: self.iterations,
total: total_wgpu,
mean: mean_wgpu,
mflops: Some(mflops_wgpu),
};
out.push(wgpu_report.clone());
self.reports.push(wgpu_report);
}
}
out
}
pub fn cpu_vs_wgpu_sph(&mut self, n: usize) -> Vec<GpuBenchReport> {
let cfg = SphConfig {
n_particles: n,
smoothing_h: 0.1,
rest_density: 1000.0,
gravity: 0.0,
domain_min: [-10.; 3],
domain_max: [10.; 3],
..SphConfig::default()
};
let mut out = Vec::new();
{
let mut sim = SphSimulation::new(cfg.clone());
let side = (n as f64).cbrt().ceil() as usize + 1;
for (idx, i) in (0..n).enumerate() {
let x = (idx % side) as f64 * 0.1 - 5.0;
let y = ((idx / side) % side) as f64 * 0.1;
let z = (idx / (side * side)) as f64 * 0.1;
sim.state.pos_x[i] = x;
sim.state.pos_y[i] = y;
sim.state.pos_z[i] = z;
}
for _ in 0..self.warmup {
sim.step(1.0 / 60.0);
}
let t0 = Instant::now();
for _ in 0..self.iterations {
sim.step(1.0 / 60.0);
}
let total = t0.elapsed();
let r = GpuBenchReport {
name: "sph_density_cpu".to_string(),
backend: BackendKind::Cpu,
n,
iterations: self.iterations,
total,
mean: total / self.iterations,
mflops: None,
};
out.push(r.clone());
self.reports.push(r);
}
if crate::compute::WgpuBackend::try_new().is_ok() {
let mut sim = SphSimulation::new(cfg.clone());
let side = (n as f64).cbrt().ceil() as usize + 1;
for (idx, i) in (0..n).enumerate() {
sim.state.pos_x[i] = (idx % side) as f64 * 0.1 - 5.0;
sim.state.pos_y[i] = ((idx / side) % side) as f64 * 0.1;
sim.state.pos_z[i] = (idx / (side * side)) as f64 * 0.1;
}
for _ in 0..self.warmup {
sim.step(1.0 / 60.0);
}
let t0 = Instant::now();
for _ in 0..self.iterations {
sim.step(1.0 / 60.0);
}
let total = t0.elapsed();
let backend = if sim.has_gpu() {
BackendKind::Wgpu
} else {
BackendKind::Cpu
};
let r = GpuBenchReport {
name: "sph_density_wgpu".to_string(),
backend,
n,
iterations: self.iterations,
total,
mean: total / self.iterations,
mflops: None,
};
out.push(r.clone());
self.reports.push(r);
}
out
}
pub fn cpu_vs_cuda_sph(&mut self, n: usize) -> Vec<GpuBenchReport> {
let cfg = crate::sph_gpu::SphConfig {
n_particles: n,
smoothing_h: 0.1,
rest_density: 1000.0,
gravity: 0.0,
domain_min: [-10.; 3],
domain_max: [10.; 3],
..crate::sph_gpu::SphConfig::default()
};
let mut out = Vec::new();
{
let mut sim = crate::sph_gpu::SphSimulation::new(cfg.clone());
let side = (n as f64).cbrt().ceil() as usize + 1;
for idx in 0..n {
let x = (idx % side) as f64 * 0.1 - 5.0;
let y = ((idx / side) % side) as f64 * 0.1;
let z = (idx / (side * side)) as f64 * 0.1;
sim.state.pos_x[idx] = x;
sim.state.pos_y[idx] = y;
sim.state.pos_z[idx] = z;
}
for _ in 0..self.warmup {
sim.step(1.0 / 60.0);
}
let t0 = Instant::now();
for _ in 0..self.iterations {
sim.step(1.0 / 60.0);
}
let total = t0.elapsed();
let r = GpuBenchReport {
name: "cuda_sph_density_cpu".to_string(),
backend: BackendKind::Cpu,
n,
iterations: self.iterations,
total,
mean: total / self.iterations,
mflops: None,
};
out.push(r.clone());
self.reports.push(r);
}
#[cfg(feature = "cuda-backend")]
{
use crate::compute::cuda_backend::{CUDA_SPH_DENSITY_SRC, CudaBackend};
if let Ok(mut backend) = CudaBackend::try_new(0) {
let compiled =
backend.compile_and_register("sph_density_kernel", CUDA_SPH_DENSITY_SRC);
if compiled.is_ok() {
let side = (n as f64).cbrt().ceil() as usize + 1;
let mut positions = vec![0.0_f64; n * 3];
for idx in 0..n {
positions[3 * idx] = (idx % side) as f64 * 0.1 - 5.0;
positions[3 * idx + 1] = ((idx / side) % side) as f64 * 0.1;
positions[3 * idx + 2] = (idx / (side * side)) as f64 * 0.1;
}
let pos_buf = backend.create_buffer(n * 3);
let den_buf = backend.create_buffer(n);
backend.write_buffer(pos_buf, &positions);
let block_x: u32 = 256;
let grid_x = (n as u32).div_ceil(block_x);
let n_i32 = [n as i32];
let scalars_f64 = [
cfg.smoothing_h,
if cfg.particle_mass > 0.0 {
cfg.particle_mass
} else {
1.0
},
];
for _ in 0..self.warmup {
backend.launch_with_scalars(
"sph_density_kernel",
&[pos_buf, den_buf],
&n_i32,
&scalars_f64,
grid_x,
block_x,
);
backend.synchronize();
}
let t0 = Instant::now();
for _ in 0..self.iterations {
backend.launch_with_scalars(
"sph_density_kernel",
&[pos_buf, den_buf],
&n_i32,
&scalars_f64,
grid_x,
block_x,
);
backend.synchronize();
}
let total = t0.elapsed();
let r = GpuBenchReport {
name: "cuda_sph_density_gpu".to_string(),
backend: BackendKind::Cuda,
n,
iterations: self.iterations,
total,
mean: total / self.iterations,
mflops: None,
};
out.push(r.clone());
self.reports.push(r);
}
}
}
out
}
pub fn print_comparison(&self) {
println!("\n{:=<75}", "");
println!(
"{:<5} {:<22} {:>8} {:>12} {:>10}",
"Back", "Kernel", "N", "Mean (µs)", "MFLOPs"
);
println!("{:=<75}", "");
for r in &self.reports {
let mf = r.mflops.map_or("—".to_string(), |m| format!("{:.1}", m));
println!(
"{:<5} {:<22} {:>8} {:>12.3} {:>10}",
r.backend,
r.name,
r.n,
r.mean.as_secs_f64() * 1e6,
mf
);
}
println!("{:=<75}", "");
}
}
impl Default for GpuBenchHarness {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone)]
pub struct SpeedupReport {
pub cpu_mean: Duration,
pub wgpu_mean: Option<Duration>,
pub speedup: Option<f64>,
}
pub fn compute_speedup(reports: &[GpuBenchReport]) -> SpeedupReport {
let cpu_mean = reports.first().map(|r| r.mean).unwrap_or(Duration::ZERO);
let wgpu_mean = reports.get(1).map(|r| r.mean);
let speedup = wgpu_mean.map(|wm| {
if wm.as_secs_f64() > 0.0 {
cpu_mean.as_secs_f64() / wm.as_secs_f64()
} else {
f64::INFINITY
}
});
SpeedupReport {
cpu_mean,
wgpu_mean,
speedup,
}
}
#[derive(Debug, Clone)]
pub struct CudaSpeedupReport {
pub cpu_mean: Duration,
pub cuda_mean: Option<Duration>,
pub speedup: Option<f64>,
}
pub fn compute_cuda_speedup(reports: &[GpuBenchReport]) -> CudaSpeedupReport {
let cpu_mean = reports.first().map(|r| r.mean).unwrap_or(Duration::ZERO);
let cuda_mean = reports.get(1).map(|r| r.mean);
let speedup = cuda_mean.map(|cm| {
if cm.as_secs_f64() > 0.0 {
cpu_mean.as_secs_f64() / cm.as_secs_f64()
} else {
f64::INFINITY
}
});
CudaSpeedupReport {
cpu_mean,
cuda_mean,
speedup,
}
}
pub fn inclusive_scan_cpu(data: &[f64]) -> Vec<f64> {
let mut out = Vec::with_capacity(data.len());
let mut acc = 0.0_f64;
for &v in data {
acc += v;
out.push(acc);
}
out
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_available_backends_has_cpu() {
let b = GpuBenchHarness::available_backends();
assert!(b.contains(&BackendKind::Cpu));
}
#[test]
fn test_inclusive_scan() {
let data = vec![1.0, 2.0, 3.0, 4.0];
let out = inclusive_scan_cpu(&data);
assert_eq!(out, vec![1.0, 3.0, 6.0, 10.0]);
}
#[test]
fn test_bench_sph_density_returns_at_least_cpu() {
let mut h = GpuBenchHarness {
warmup: 0,
iterations: 1,
reports: Vec::new(),
};
let reports = h.bench_sph_density(8);
assert!(!reports.is_empty());
assert_eq!(reports[0].backend, BackendKind::Cpu);
}
#[test]
fn test_bench_lbm_step() {
let mut h = GpuBenchHarness {
warmup: 0,
iterations: 1,
reports: Vec::new(),
};
let reports = h.bench_lbm_step(4, 4, 4);
assert_eq!(reports.len(), 1);
assert_eq!(reports[0].n, 64);
}
#[test]
fn test_bench_parallel_scan() {
let mut h = GpuBenchHarness {
warmup: 0,
iterations: 1,
reports: Vec::new(),
};
let r = h.bench_parallel_scan(100);
assert_eq!(r.n, 100);
assert!(r.mflops.is_some());
}
#[test]
fn test_run_full_suite() {
let mut h = GpuBenchHarness {
warmup: 0,
iterations: 1,
reports: Vec::new(),
};
let summary = h.run_full_suite();
assert!(summary.contains("benchmarks"));
}
#[test]
fn test_backend_display() {
assert_eq!(format!("{}", BackendKind::Cpu), "CPU");
assert_eq!(format!("{}", BackendKind::Wgpu), "wgpu");
assert_eq!(format!("{}", BackendKind::Cuda), "CUDA");
}
}