oxiphysics_gpu/
gpu_bench.rs

1// Copyright 2026 COOLJAPAN OU (Team KitaSan)
2// SPDX-License-Identifier: Apache-2.0
3
4//! Performance benchmarks comparing CPU, wgpu, and CUDA backends.
5//!
6//! This module measures throughput for the core compute kernels (SPH density,
7//! LBM collision, parallel scan) across available backends and reports
//! wall-clock timing and effective MFLOP/s estimates.
9//!
10//! # Quick usage
11//!
12//! ```
13//! use oxiphysics_gpu::gpu_bench::{GpuBenchHarness, BackendKind};
14//!
15//! let mut h = GpuBenchHarness::new();
16//!
17//! // Benchmark SPH density summation for 256 particles
18//! let reports = h.bench_sph_density(256);
19//! for r in &reports {
20//!     println!("{}", r);
21//! }
22//!
23//! // Compare available backends
24//! let available = GpuBenchHarness::available_backends();
25//! assert!(available.contains(&BackendKind::Cpu));
26//! ```
27
28use std::time::{Duration, Instant};
29
30use crate::lbm_gpu::{LbmConfig, LbmSimulation};
31use crate::sph_gpu::{SphConfig, SphSimulation};
32
33// ── BackendKind ───────────────────────────────────────────────────────────────
34
/// Identifies a compute backend for benchmark reporting.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BackendKind {
    /// CPU (Rayon-parallel fallback).
    Cpu,
    /// wgpu (WebGPU compute shaders).
    Wgpu,
    /// CUDA via cudarc.
    Cuda,
}

impl std::fmt::Display for BackendKind {
    /// Writes the short backend label: `CPU`, `wgpu` or `CUDA`.
    ///
    /// The label goes through [`std::fmt::Formatter::pad`] so that width,
    /// fill and alignment flags (for example `{:<5}` in table output) are
    /// honoured.  Writing the literal with `write!` would silently ignore
    /// those flags and leave table columns misaligned.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Cpu => "CPU",
            Self::Wgpu => "wgpu",
            Self::Cuda => "CUDA",
        };
        f.pad(label)
    }
}
55
56// ── GpuBenchReport ────────────────────────────────────────────────────────────
57
58/// Result of a single GPU benchmark run.
59#[derive(Debug, Clone)]
60pub struct GpuBenchReport {
61    /// Kernel / benchmark name.
62    pub name: String,
63    /// Which backend was measured.
64    pub backend: BackendKind,
65    /// Problem size (particles, cells, …).
66    pub n: usize,
67    /// Number of timed iterations.
68    pub iterations: u32,
69    /// Total wall-clock time.
70    pub total: Duration,
71    /// Mean time per iteration.
72    pub mean: Duration,
73    /// Estimated throughput (MFLOP/s or Mparticles/s depending on kernel).
74    pub mflops: Option<f64>,
75}
76
77impl std::fmt::Display for GpuBenchReport {
78    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
79        write!(
80            f,
81            "[{:<5} {:>20}] n={:>6} mean={:.3}µs",
82            self.backend,
83            self.name,
84            self.n,
85            self.mean.as_secs_f64() * 1e6
86        )?;
87        if let Some(mf) = self.mflops {
88            write!(f, "  {:.1} MFLOPs", mf)?;
89        }
90        Ok(())
91    }
92}
93
94// ── GpuBenchHarness ───────────────────────────────────────────────────────────
95
96/// Timing harness for GPU/CPU backend comparison benchmarks.
97pub struct GpuBenchHarness {
98    /// Warm-up iterations (not timed).
99    pub warmup: u32,
100    /// Timed iterations.
101    pub iterations: u32,
102    /// Collected reports.
103    pub reports: Vec<GpuBenchReport>,
104}
105
106impl GpuBenchHarness {
107    /// Create a harness with 2 warm-up and 5 timed iterations.
108    pub fn new() -> Self {
109        Self {
110            warmup: 2,
111            iterations: 5,
112            reports: Vec::new(),
113        }
114    }
115
116    /// Return which backends are available in this build.
117    ///
118    /// CPU is always available.  wgpu and CUDA depend on feature flags and
119    /// device availability (they appear in the list only when initialisation
120    /// succeeds).
121    pub fn available_backends() -> Vec<BackendKind> {
122        let mut out = vec![BackendKind::Cpu];
123
124        // Try wgpu — succeeds when the GPU driver is present.
125        if crate::compute::WgpuBackend::try_new().is_ok() {
126            out.push(BackendKind::Wgpu);
127        }
128
129        // Try CUDA — this is a stub that always reports unavailable.
130        if crate::compute::cuda_backend::CudaBackend::try_new(0).is_ok() {
131            out.push(BackendKind::Cuda);
132        }
133
134        out
135    }
136
137    // ── SPH density benchmark ─────────────────────────────────────────────────
138
139    /// Benchmark SPH density summation for `n` particles on all available backends.
140    ///
141    /// Each particle's density is recomputed from scratch each call to avoid
142    /// caching effects.  FLOPs estimated as 10 × N² (distance + kernel eval).
143    pub fn bench_sph_density(&mut self, n: usize) -> Vec<GpuBenchReport> {
144        let cfg = SphConfig {
145            n_particles: n,
146            smoothing_h: 0.1,
147            rest_density: 1000.0,
148            gravity: 0.0, // no gravity — pure density bench
149            domain_min: [-10.; 3],
150            domain_max: [10.; 3],
151            ..SphConfig::default()
152        };
153
154        let mut out = Vec::new();
155
156        // ── CPU path ──────────────────────────────────────────────────────────
157        {
158            // Build a CPU-only sim (no GPU backend will be tried)
159            let mut sim = SphSimulation::new(cfg.clone());
160            // Scatter particles in a regular grid
161            let side = (n as f64).cbrt().ceil() as usize + 1;
162            for (idx, i) in (0..n).enumerate() {
163                let x = (idx % side) as f64 * 0.1 - 5.0;
164                let y = ((idx / side) % side) as f64 * 0.1;
165                let z = (idx / (side * side)) as f64 * 0.1;
166                sim.state.pos_x[i] = x;
167                sim.state.pos_y[i] = y;
168                sim.state.pos_z[i] = z;
169            }
170
171            // Warm-up
172            for _ in 0..self.warmup {
173                sim.step(1.0 / 60.0);
174            }
175
176            let t0 = Instant::now();
177            for _ in 0..self.iterations {
178                sim.step(1.0 / 60.0);
179            }
180            let total = t0.elapsed();
181
182            let flops = 10.0 * n as f64 * n as f64;
183            let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
184
185            let r = GpuBenchReport {
186                name: "sph_density".to_string(),
187                backend: BackendKind::Cpu,
188                n,
189                iterations: self.iterations,
190                total,
191                mean: total / self.iterations,
192                mflops: Some(mflops),
193            };
194            out.push(r.clone());
195            self.reports.push(r);
196        }
197
198        // ── wgpu path (if available) ──────────────────────────────────────────
199        if crate::compute::WgpuBackend::try_new().is_ok() {
200            let mut sim = SphSimulation::new(cfg.clone());
201            let side = (n as f64).cbrt().ceil() as usize + 1;
202            for (idx, i) in (0..n).enumerate() {
203                sim.state.pos_x[i] = (idx % side) as f64 * 0.1 - 5.0;
204                sim.state.pos_y[i] = ((idx / side) % side) as f64 * 0.1;
205                sim.state.pos_z[i] = (idx / (side * side)) as f64 * 0.1;
206            }
207
208            for _ in 0..self.warmup {
209                sim.step(1.0 / 60.0);
210            }
211            let t0 = Instant::now();
212            for _ in 0..self.iterations {
213                sim.step(1.0 / 60.0);
214            }
215            let total = t0.elapsed();
216
217            let backend = if sim.has_gpu() {
218                BackendKind::Wgpu
219            } else {
220                BackendKind::Cpu
221            };
222            let flops = 10.0 * n as f64 * n as f64;
223            let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
224
225            let r = GpuBenchReport {
226                name: "sph_density".to_string(),
227                backend,
228                n,
229                iterations: self.iterations,
230                total,
231                mean: total / self.iterations,
232                mflops: Some(mflops),
233            };
234            out.push(r.clone());
235            self.reports.push(r);
236        }
237
238        out
239    }
240
241    // ── LBM benchmark ─────────────────────────────────────────────────────────
242
243    /// Benchmark one LBM BGK step on an `nx × ny × nz` domain.
244    ///
245    /// FLOPs estimated as 120 × nc (19 distribution reads + BGK + streaming).
246    pub fn bench_lbm_step(&mut self, nx: usize, ny: usize, nz: usize) -> Vec<GpuBenchReport> {
247        let cfg = LbmConfig {
248            nx,
249            ny,
250            nz,
251            tau: 0.6,
252            rho0: 1.0,
253            force_x: 0.0,
254            force_y: 0.0,
255            force_z: 0.0,
256        };
257        let nc = nx * ny * nz;
258        let mut out = Vec::new();
259
260        // CPU path
261        {
262            let mut sim = LbmSimulation::new(cfg.clone());
263            sim.set_lid_velocity(0.1, 0.0, 0.0);
264
265            for _ in 0..self.warmup {
266                sim.step();
267            }
268            let t0 = Instant::now();
269            for _ in 0..self.iterations {
270                sim.step();
271            }
272            let total = t0.elapsed();
273
274            let flops = 120.0 * nc as f64;
275            let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
276
277            let r = GpuBenchReport {
278                name: format!("lbm_bgk_{}x{}x{}", nx, ny, nz),
279                backend: BackendKind::Cpu,
280                n: nc,
281                iterations: self.iterations,
282                total,
283                mean: total / self.iterations,
284                mflops: Some(mflops),
285            };
286            out.push(r.clone());
287            self.reports.push(r);
288        }
289
290        out
291    }
292
293    // ── Particle scan benchmark ───────────────────────────────────────────────
294
295    /// Benchmark parallel prefix scan on `n` f64 elements (CPU Rayon scan).
296    ///
297    /// FLOPs = 2n (N adds in up-sweep + N adds in down-sweep).
298    pub fn bench_parallel_scan(&mut self, n: usize) -> GpuBenchReport {
299        let data: Vec<f64> = (0..n).map(|i| i as f64 + 1.0).collect();
300
301        for _ in 0..self.warmup {
302            let _ = inclusive_scan_cpu(&data);
303        }
304        let t0 = Instant::now();
305        let mut result = Vec::new();
306        for _ in 0..self.iterations {
307            result = inclusive_scan_cpu(&data);
308        }
309        let total = t0.elapsed();
310        let _ = result;
311
312        let flops = 2.0 * n as f64;
313        let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
314
315        let r = GpuBenchReport {
316            name: "parallel_scan".to_string(),
317            backend: BackendKind::Cpu,
318            n,
319            iterations: self.iterations,
320            total,
321            mean: total / self.iterations,
322            mflops: Some(mflops),
323        };
324        self.reports.push(r.clone());
325        r
326    }
327
328    // ── Full suite ────────────────────────────────────────────────────────────
329
330    /// Run the complete GPU benchmark suite and return a formatted summary.
331    ///
332    /// ```
333    /// use oxiphysics_gpu::gpu_bench::GpuBenchHarness;
334    /// let mut h = GpuBenchHarness::new();
335    /// let summary = h.run_full_suite();
336    /// assert!(!summary.is_empty());
337    /// ```
338    pub fn run_full_suite(&mut self) -> String {
339        self.bench_sph_density(64);
340        self.bench_sph_density(256);
341        self.bench_lbm_step(8, 8, 8);
342        self.bench_lbm_step(16, 16, 4);
343        self.bench_parallel_scan(1024);
344        self.bench_parallel_scan(65536);
345
346        let mut out = format!("{} benchmarks\n", self.reports.len());
347        for r in &self.reports {
348            out.push_str(&format!("  {}\n", r));
349        }
350        out
351    }
352
353    /// Benchmark CPU inclusive scan vs wgpu copy dispatch for `n` f64 elements.
354    ///
355    /// Both sides operate on the same data (a ramp of 0.0..n).  The wgpu side
356    /// dispatches a copy shader (since f32 on-device means scan parity is a
357    /// different test).  Returns a `Vec` with one CPU report, and optionally one
358    /// wgpu report if an adapter is available.
359    ///
360    /// If no GPU adapter is present, only the CPU report is returned (no panic).
361    ///
362    /// ```
363    /// use oxiphysics_gpu::gpu_bench::GpuBenchHarness;
364    /// let mut h = GpuBenchHarness::new();
365    /// let reports = h.cpu_vs_wgpu_comparison(1000);
366    /// assert!(!reports.is_empty());
367    /// assert_eq!(reports[0].name, "cpu_copy_scan");
368    /// ```
369    pub fn cpu_vs_wgpu_comparison(&mut self, n: usize) -> Vec<GpuBenchReport> {
370        let mut out = Vec::new();
371
372        // ── CPU path: inclusive scan ──────────────────────────────────────────
373        let data: Vec<f64> = (0..n).map(|i| i as f64).collect();
374        for _ in 0..self.warmup {
375            let _ = inclusive_scan_cpu(&data);
376        }
377        let t0 = std::time::Instant::now();
378        for _ in 0..self.iterations {
379            let _ = inclusive_scan_cpu(&data);
380        }
381        let total_cpu = t0.elapsed();
382        let mean_cpu = total_cpu / self.iterations;
383        let flops = 2.0 * n as f64;
384        let mflops_cpu = flops / (total_cpu.as_secs_f64() / self.iterations as f64) / 1e6;
385
386        let cpu_report = GpuBenchReport {
387            name: "cpu_copy_scan".to_string(),
388            backend: BackendKind::Cpu,
389            n,
390            iterations: self.iterations,
391            total: total_cpu,
392            mean: mean_cpu,
393            mflops: Some(mflops_cpu),
394        };
395        out.push(cpu_report.clone());
396        self.reports.push(cpu_report);
397
398        // ── wgpu path (feature-gated) ─────────────────────────────────────────
399        #[cfg(feature = "wgpu-backend")]
400        {
401            use crate::compute::wgpu_backend::real::WgpuBackendReal;
402
403            let backend_result = WgpuBackendReal::try_new();
404            if let Ok(mut backend) = backend_result {
405                const COPY_WGSL: &str = r#"
406@group(0) @binding(0) var<storage, read>       in_buf:  array<f32>;
407@group(0) @binding(1) var<storage, read_write> out_buf: array<f32>;
408
409@compute @workgroup_size(64)
410fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
411    let i = gid.x;
412    if (i < arrayLength(&in_buf)) {
413        out_buf[i] = in_buf[i];
414    }
415}
416"#;
417                let in_buf = backend.create_buffer_f64(n);
418                let out_buf = backend.create_buffer_f64(n);
419                backend.write_buffer_f64(in_buf, &data);
420
421                let workgroups = WgpuBackendReal::dispatch_count_for(n, 64);
422
423                // Warm-up
424                for _ in 0..self.warmup {
425                    let _ = backend.dispatch_wgsl(
426                        COPY_WGSL,
427                        "main",
428                        &[
429                            (in_buf, wgpu::BufferBindingType::Storage { read_only: true }),
430                            (
431                                out_buf,
432                                wgpu::BufferBindingType::Storage { read_only: false },
433                            ),
434                        ],
435                        workgroups,
436                    );
437                }
438
439                let t0 = std::time::Instant::now();
440                for _ in 0..self.iterations {
441                    let _ = backend.dispatch_wgsl(
442                        COPY_WGSL,
443                        "main",
444                        &[
445                            (in_buf, wgpu::BufferBindingType::Storage { read_only: true }),
446                            (
447                                out_buf,
448                                wgpu::BufferBindingType::Storage { read_only: false },
449                            ),
450                        ],
451                        workgroups,
452                    );
453                }
454                let total_wgpu = t0.elapsed();
455                let mean_wgpu = total_wgpu / self.iterations;
456                let mflops_wgpu = flops / (total_wgpu.as_secs_f64() / self.iterations as f64) / 1e6;
457
458                let wgpu_report = GpuBenchReport {
459                    name: "wgpu_copy_dispatch".to_string(),
460                    backend: BackendKind::Wgpu,
461                    n,
462                    iterations: self.iterations,
463                    total: total_wgpu,
464                    mean: mean_wgpu,
465                    mflops: Some(mflops_wgpu),
466                };
467                out.push(wgpu_report.clone());
468                self.reports.push(wgpu_report);
469            }
470        }
471
472        out
473    }
474
475    /// Benchmark the SPH density kernel on CPU and wgpu backends side-by-side.
476    ///
477    /// Builds an SPH simulation with `n` particles arranged in a uniform grid
478    /// inside the domain `[-10, 10]³`. Runs the full `SphSimulation::step`
479    /// (density + pressure + accel + integrate) on both backends and returns
480    /// timing reports.
481    ///
482    /// If no wgpu adapter is available, only the CPU report is returned.
483    ///
484    /// # Example
485    /// ```
486    /// let mut h = oxiphysics_gpu::gpu_bench::GpuBenchHarness::new();
487    /// let reports = h.cpu_vs_wgpu_sph(64);
488    /// assert!(!reports.is_empty());
489    /// ```
490    pub fn cpu_vs_wgpu_sph(&mut self, n: usize) -> Vec<GpuBenchReport> {
491        let cfg = SphConfig {
492            n_particles: n,
493            smoothing_h: 0.1,
494            rest_density: 1000.0,
495            gravity: 0.0,
496            domain_min: [-10.; 3],
497            domain_max: [10.; 3],
498            ..SphConfig::default()
499        };
500
501        let mut out = Vec::new();
502
503        // ── CPU path ──────────────────────────────────────────────────────────
504        {
505            let mut sim = SphSimulation::new(cfg.clone());
506            let side = (n as f64).cbrt().ceil() as usize + 1;
507            for (idx, i) in (0..n).enumerate() {
508                let x = (idx % side) as f64 * 0.1 - 5.0;
509                let y = ((idx / side) % side) as f64 * 0.1;
510                let z = (idx / (side * side)) as f64 * 0.1;
511                sim.state.pos_x[i] = x;
512                sim.state.pos_y[i] = y;
513                sim.state.pos_z[i] = z;
514            }
515            for _ in 0..self.warmup {
516                sim.step(1.0 / 60.0);
517            }
518            let t0 = Instant::now();
519            for _ in 0..self.iterations {
520                sim.step(1.0 / 60.0);
521            }
522            let total = t0.elapsed();
523
524            let r = GpuBenchReport {
525                name: "sph_density_cpu".to_string(),
526                backend: BackendKind::Cpu,
527                n,
528                iterations: self.iterations,
529                total,
530                mean: total / self.iterations,
531                mflops: None,
532            };
533            out.push(r.clone());
534            self.reports.push(r);
535        }
536
537        // ── wgpu path (runtime-gated) ─────────────────────────────────────────
538        if crate::compute::WgpuBackend::try_new().is_ok() {
539            let mut sim = SphSimulation::new(cfg.clone());
540            let side = (n as f64).cbrt().ceil() as usize + 1;
541            for (idx, i) in (0..n).enumerate() {
542                sim.state.pos_x[i] = (idx % side) as f64 * 0.1 - 5.0;
543                sim.state.pos_y[i] = ((idx / side) % side) as f64 * 0.1;
544                sim.state.pos_z[i] = (idx / (side * side)) as f64 * 0.1;
545            }
546
547            for _ in 0..self.warmup {
548                sim.step(1.0 / 60.0);
549            }
550            let t0 = Instant::now();
551            for _ in 0..self.iterations {
552                sim.step(1.0 / 60.0);
553            }
554            let total = t0.elapsed();
555
556            let backend = if sim.has_gpu() {
557                BackendKind::Wgpu
558            } else {
559                BackendKind::Cpu
560            };
561
562            let r = GpuBenchReport {
563                name: "sph_density_wgpu".to_string(),
564                backend,
565                n,
566                iterations: self.iterations,
567                total,
568                mean: total / self.iterations,
569                mflops: None,
570            };
571            out.push(r.clone());
572            self.reports.push(r);
573        }
574
575        out
576    }
577
578    /// Run SPH density on CPU and (optionally) CUDA; return timing reports.
579    ///
580    /// The CPU path runs the same `SphSimulation::step` loop as
581    /// [`Self::cpu_vs_wgpu_sph`] but is tagged with `"cuda_sph_density_cpu"`.
582    ///
583    /// Under the `cuda-backend` feature, a second report is added when a CUDA
584    /// device is available at runtime.  If no CUDA driver is present (e.g. on
585    /// macOS) only the CPU report is returned — no panic.
586    ///
587    /// # Example
588    /// ```
589    /// let mut h = oxiphysics_gpu::gpu_bench::GpuBenchHarness::new();
590    /// let reports = h.cpu_vs_cuda_sph(64);
591    /// assert!(!reports.is_empty());
592    /// assert!(reports[0].name.contains("sph_density"));
593    /// assert!(reports[0].mean > std::time::Duration::ZERO);
594    /// ```
595    pub fn cpu_vs_cuda_sph(&mut self, n: usize) -> Vec<GpuBenchReport> {
596        let cfg = crate::sph_gpu::SphConfig {
597            n_particles: n,
598            smoothing_h: 0.1,
599            rest_density: 1000.0,
600            gravity: 0.0,
601            domain_min: [-10.; 3],
602            domain_max: [10.; 3],
603            ..crate::sph_gpu::SphConfig::default()
604        };
605        let mut out = Vec::new();
606
607        // ── CPU path ──────────────────────────────────────────────────────────
608        {
609            let mut sim = crate::sph_gpu::SphSimulation::new(cfg.clone());
610            let side = (n as f64).cbrt().ceil() as usize + 1;
611            for idx in 0..n {
612                let x = (idx % side) as f64 * 0.1 - 5.0;
613                let y = ((idx / side) % side) as f64 * 0.1;
614                let z = (idx / (side * side)) as f64 * 0.1;
615                sim.state.pos_x[idx] = x;
616                sim.state.pos_y[idx] = y;
617                sim.state.pos_z[idx] = z;
618            }
619            for _ in 0..self.warmup {
620                sim.step(1.0 / 60.0);
621            }
622            let t0 = Instant::now();
623            for _ in 0..self.iterations {
624                sim.step(1.0 / 60.0);
625            }
626            let total = t0.elapsed();
627
628            let r = GpuBenchReport {
629                name: "cuda_sph_density_cpu".to_string(),
630                backend: BackendKind::Cpu,
631                n,
632                iterations: self.iterations,
633                total,
634                mean: total / self.iterations,
635                mflops: None,
636            };
637            out.push(r.clone());
638            self.reports.push(r);
639        }
640
641        // ── CUDA path (feature + runtime gated) ───────────────────────────────
642        #[cfg(feature = "cuda-backend")]
643        {
644            use crate::compute::cuda_backend::{CUDA_SPH_DENSITY_SRC, CudaBackend};
645
646            if let Ok(mut backend) = CudaBackend::try_new(0) {
647                // Compile and register the SPH density kernel.
648                let compiled =
649                    backend.compile_and_register("sph_density_kernel", CUDA_SPH_DENSITY_SRC);
650                if compiled.is_ok() {
651                    // Build position buffer (n × 3 doubles, interleaved xyz).
652                    let side = (n as f64).cbrt().ceil() as usize + 1;
653                    let mut positions = vec![0.0_f64; n * 3];
654                    for idx in 0..n {
655                        positions[3 * idx] = (idx % side) as f64 * 0.1 - 5.0;
656                        positions[3 * idx + 1] = ((idx / side) % side) as f64 * 0.1;
657                        positions[3 * idx + 2] = (idx / (side * side)) as f64 * 0.1;
658                    }
659
660                    let pos_buf = backend.create_buffer(n * 3);
661                    let den_buf = backend.create_buffer(n);
662                    backend.write_buffer(pos_buf, &positions);
663
664                    let block_x: u32 = 256;
665                    let grid_x = (n as u32).div_ceil(block_x);
666
667                    // The kernel signature is:
668                    //   sph_density_kernel(const double*, double*, int, double, double)
669                    // so we forward (n_particles, smoothing_h, particle_mass)
670                    // as scalar arguments after the two buffer arguments.
671                    let n_i32 = [n as i32];
672                    let scalars_f64 = [
673                        cfg.smoothing_h,
674                        // Use a unit particle mass if the config left it as
675                        // zero (cpu_vs_cuda_sph does not run SphSimulation::new
676                        // for the GPU path, so the default 0.0 mass is fine to
677                        // override for a smoke benchmark).
678                        if cfg.particle_mass > 0.0 {
679                            cfg.particle_mass
680                        } else {
681                            1.0
682                        },
683                    ];
684
685                    // Warm-up
686                    for _ in 0..self.warmup {
687                        backend.launch_with_scalars(
688                            "sph_density_kernel",
689                            &[pos_buf, den_buf],
690                            &n_i32,
691                            &scalars_f64,
692                            grid_x,
693                            block_x,
694                        );
695                        backend.synchronize();
696                    }
697
698                    let t0 = Instant::now();
699                    for _ in 0..self.iterations {
700                        backend.launch_with_scalars(
701                            "sph_density_kernel",
702                            &[pos_buf, den_buf],
703                            &n_i32,
704                            &scalars_f64,
705                            grid_x,
706                            block_x,
707                        );
708                        backend.synchronize();
709                    }
710                    let total = t0.elapsed();
711
712                    let r = GpuBenchReport {
713                        name: "cuda_sph_density_gpu".to_string(),
714                        backend: BackendKind::Cuda,
715                        n,
716                        iterations: self.iterations,
717                        total,
718                        mean: total / self.iterations,
719                        mflops: None,
720                    };
721                    out.push(r.clone());
722                    self.reports.push(r);
723                }
724            }
725        }
726
727        out
728    }
729
730    /// Print a comparison table for all collected reports.
731    pub fn print_comparison(&self) {
732        println!("\n{:=<75}", "");
733        println!(
734            "{:<5} {:<22} {:>8} {:>12} {:>10}",
735            "Back", "Kernel", "N", "Mean (µs)", "MFLOPs"
736        );
737        println!("{:=<75}", "");
738        for r in &self.reports {
739            let mf = r.mflops.map_or("—".to_string(), |m| format!("{:.1}", m));
740            println!(
741                "{:<5} {:<22} {:>8} {:>12.3} {:>10}",
742                r.backend,
743                r.name,
744                r.n,
745                r.mean.as_secs_f64() * 1e6,
746                mf
747            );
748        }
749        println!("{:=<75}", "");
750    }
751}
752
753impl Default for GpuBenchHarness {
754    fn default() -> Self {
755        Self::new()
756    }
757}
758
759// ── SpeedupReport ─────────────────────────────────────────────────────────────
760
/// CPU-versus-wgpu timing comparison derived from a pair of bench reports.
#[derive(Debug, Clone)]
pub struct SpeedupReport {
    /// Mean per-iteration time of the CPU baseline.
    pub cpu_mean: Duration,
    /// Mean per-iteration time of the wgpu run; `None` when it was not measured.
    pub wgpu_mean: Option<Duration>,
    /// `cpu_mean / wgpu_mean`; `None` when there is no wgpu measurement.
    pub speedup: Option<f64>,
}
771
772/// Compute a speedup ratio from a pair of bench reports.
773///
774/// Expects `reports[0]` to be the CPU report and `reports[1]` (if present)
775/// to be the wgpu report.  Returns `SpeedupReport { speedup: None }` if
776/// only one report is present (GPU unavailable).
777///
778/// # Example
779/// ```
780/// use oxiphysics_gpu::gpu_bench::{GpuBenchHarness, compute_speedup};
781/// let mut h = GpuBenchHarness::new();
782/// let reports = h.cpu_vs_wgpu_sph(64);
783/// let sr = compute_speedup(&reports);
784/// assert!(sr.cpu_mean.as_secs_f64() > 0.0);
785/// ```
786pub fn compute_speedup(reports: &[GpuBenchReport]) -> SpeedupReport {
787    let cpu_mean = reports.first().map(|r| r.mean).unwrap_or(Duration::ZERO);
788    let wgpu_mean = reports.get(1).map(|r| r.mean);
789    let speedup = wgpu_mean.map(|wm| {
790        if wm.as_secs_f64() > 0.0 {
791            cpu_mean.as_secs_f64() / wm.as_secs_f64()
792        } else {
793            f64::INFINITY
794        }
795    });
796    SpeedupReport {
797        cpu_mean,
798        wgpu_mean,
799        speedup,
800    }
801}
802
803// ── CudaSpeedupReport ─────────────────────────────────────────────────────────
804
/// CPU-versus-CUDA timing comparison derived from a pair of bench reports.
#[derive(Debug, Clone)]
pub struct CudaSpeedupReport {
    /// Mean per-iteration time of the CPU baseline.
    pub cpu_mean: Duration,
    /// Mean per-iteration time of the CUDA run; `None` when it was not measured.
    pub cuda_mean: Option<Duration>,
    /// `cpu_mean / cuda_mean`; `None` when there is no CUDA measurement.
    pub speedup: Option<f64>,
}
815
816/// Compute a speedup ratio from a pair of bench reports produced by
817/// [`GpuBenchHarness::cpu_vs_cuda_sph`].
818///
819/// Expects `reports[0]` to be the CPU report and `reports[1]` (if present)
820/// to be the CUDA report.  Returns `CudaSpeedupReport { speedup: None }` if
821/// only one report is present (CUDA unavailable).
822///
823/// # Example
824/// ```
825/// use oxiphysics_gpu::gpu_bench::{GpuBenchHarness, compute_cuda_speedup};
826/// let mut h = GpuBenchHarness::new();
827/// let reports = h.cpu_vs_cuda_sph(64);
828/// let sr = compute_cuda_speedup(&reports);
829/// assert!(sr.cpu_mean.as_secs_f64() >= 0.0);
830/// ```
831pub fn compute_cuda_speedup(reports: &[GpuBenchReport]) -> CudaSpeedupReport {
832    let cpu_mean = reports.first().map(|r| r.mean).unwrap_or(Duration::ZERO);
833    let cuda_mean = reports.get(1).map(|r| r.mean);
834    let speedup = cuda_mean.map(|cm| {
835        if cm.as_secs_f64() > 0.0 {
836            cpu_mean.as_secs_f64() / cm.as_secs_f64()
837        } else {
838            f64::INFINITY
839        }
840    });
841    CudaSpeedupReport {
842        cpu_mean,
843        cuda_mean,
844        speedup,
845    }
846}
847
848// ── CPU helpers ───────────────────────────────────────────────────────────────
849
/// Sequential inclusive prefix sum over `f64` values.
///
/// Element `i` of the result is the running total `data[0] + … + data[i]`,
/// accumulated left to right.  An empty input gives an empty vector.
pub fn inclusive_scan_cpu(data: &[f64]) -> Vec<f64> {
    let mut sums = Vec::with_capacity(data.len());
    sums.extend(data.iter().scan(0.0_f64, |running, &x| {
        *running += x;
        Some(*running)
    }));
    sums
}
862
863// ── tests ─────────────────────────────────────────────────────────────────────
864
#[cfg(test)]
mod tests {
    use super::*;

    /// Fastest possible harness: no warm-up, a single timed iteration.
    fn quick_harness() -> GpuBenchHarness {
        GpuBenchHarness {
            warmup: 0,
            iterations: 1,
            reports: Vec::new(),
        }
    }

    #[test]
    fn test_available_backends_has_cpu() {
        let backends = GpuBenchHarness::available_backends();
        assert!(backends.contains(&BackendKind::Cpu));
    }

    #[test]
    fn test_inclusive_scan() {
        let scanned = inclusive_scan_cpu(&[1.0, 2.0, 3.0, 4.0]);
        assert_eq!(scanned, vec![1.0, 3.0, 6.0, 10.0]);
    }

    #[test]
    fn test_bench_sph_density_returns_at_least_cpu() {
        let mut h = quick_harness();
        let reports = h.bench_sph_density(8);
        assert!(!reports.is_empty());
        assert_eq!(reports[0].backend, BackendKind::Cpu);
    }

    #[test]
    fn test_bench_lbm_step() {
        let mut h = quick_harness();
        let reports = h.bench_lbm_step(4, 4, 4);
        assert_eq!(reports.len(), 1);
        assert_eq!(reports[0].n, 64);
    }

    #[test]
    fn test_bench_parallel_scan() {
        let mut h = quick_harness();
        let report = h.bench_parallel_scan(100);
        assert_eq!(report.n, 100);
        assert!(report.mflops.is_some());
    }

    #[test]
    fn test_run_full_suite() {
        let mut h = quick_harness();
        let summary = h.run_full_suite();
        assert!(summary.contains("benchmarks"));
    }

    #[test]
    fn test_backend_display() {
        let expected = [
            (BackendKind::Cpu, "CPU"),
            (BackendKind::Wgpu, "wgpu"),
            (BackendKind::Cuda, "CUDA"),
        ];
        for (kind, label) in expected {
            assert_eq!(format!("{}", kind), label);
        }
    }
}
oxiphysics_gpu/gpu_bench.rs

oxiphysics_gpu/
gpu_bench.rs