Skip to main content

oxiphysics_gpu/
gpu_bench.rs

1// Copyright 2026 COOLJAPAN OU (Team KitaSan)
2// SPDX-License-Identifier: Apache-2.0
3
4//! Performance benchmarks comparing CPU, wgpu, and CUDA backends.
5//!
6//! This module measures throughput for the core compute kernels (SPH density,
7//! LBM collision, parallel scan) across available backends and reports
8//! wall-clock timing and effective GFLOP/s estimates.
9//!
10//! # Quick usage
11//!
12//! ```
13//! use oxiphysics_gpu::gpu_bench::{GpuBenchHarness, BackendKind};
14//!
15//! let mut h = GpuBenchHarness::new();
16//!
17//! // Benchmark SPH density summation for 256 particles
18//! let reports = h.bench_sph_density(256);
19//! for r in &reports {
20//!     println!("{}", r);
21//! }
22//!
23//! // Compare available backends
24//! let available = GpuBenchHarness::available_backends();
25//! assert!(available.contains(&BackendKind::Cpu));
26//! ```
27
28#![allow(dead_code)]
29#![allow(clippy::too_many_arguments)]
30
31use std::time::{Duration, Instant};
32
33use crate::lbm_gpu::{LbmConfig, LbmSimulation};
34use crate::sph_gpu::{SphConfig, SphSimulation};
35
36// ── BackendKind ───────────────────────────────────────────────────────────────
37
38/// Identifies a compute backend for benchmark reporting.
39#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
40pub enum BackendKind {
41    /// CPU (Rayon-parallel fallback).
42    Cpu,
43    /// wgpu (WebGPU compute shaders).
44    Wgpu,
45    /// CUDA via cudarc.
46    Cuda,
47}
48
49impl std::fmt::Display for BackendKind {
50    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
51        match self {
52            Self::Cpu => write!(f, "CPU"),
53            Self::Wgpu => write!(f, "wgpu"),
54            Self::Cuda => write!(f, "CUDA"),
55        }
56    }
57}
58
59// ── GpuBenchReport ────────────────────────────────────────────────────────────
60
61/// Result of a single GPU benchmark run.
62#[derive(Debug, Clone)]
63pub struct GpuBenchReport {
64    /// Kernel / benchmark name.
65    pub name: String,
66    /// Which backend was measured.
67    pub backend: BackendKind,
68    /// Problem size (particles, cells, …).
69    pub n: usize,
70    /// Number of timed iterations.
71    pub iterations: u32,
72    /// Total wall-clock time.
73    pub total: Duration,
74    /// Mean time per iteration.
75    pub mean: Duration,
76    /// Estimated throughput (MFLOP/s or Mparticles/s depending on kernel).
77    pub mflops: Option<f64>,
78}
79
80impl std::fmt::Display for GpuBenchReport {
81    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
82        write!(
83            f,
84            "[{:<5} {:>20}] n={:>6} mean={:.3}µs",
85            self.backend,
86            self.name,
87            self.n,
88            self.mean.as_secs_f64() * 1e6
89        )?;
90        if let Some(mf) = self.mflops {
91            write!(f, "  {:.1} MFLOPs", mf)?;
92        }
93        Ok(())
94    }
95}
96
97// ── GpuBenchHarness ───────────────────────────────────────────────────────────
98
99/// Timing harness for GPU/CPU backend comparison benchmarks.
100pub struct GpuBenchHarness {
101    /// Warm-up iterations (not timed).
102    pub warmup: u32,
103    /// Timed iterations.
104    pub iterations: u32,
105    /// Collected reports.
106    pub reports: Vec<GpuBenchReport>,
107}
108
109impl GpuBenchHarness {
110    /// Create a harness with 2 warm-up and 5 timed iterations.
111    pub fn new() -> Self {
112        Self {
113            warmup: 2,
114            iterations: 5,
115            reports: Vec::new(),
116        }
117    }
118
119    /// Return which backends are available in this build.
120    ///
121    /// CPU is always available.  wgpu and CUDA depend on feature flags and
122    /// device availability (they appear in the list only when initialisation
123    /// succeeds).
124    pub fn available_backends() -> Vec<BackendKind> {
125        let mut out = vec![BackendKind::Cpu];
126
127        // Try wgpu — succeeds when the GPU driver is present.
128        if crate::compute::WgpuBackend::try_new().is_ok() {
129            out.push(BackendKind::Wgpu);
130        }
131
132        // Try CUDA — this is a stub that always reports unavailable.
133        if crate::compute::cuda_backend::CudaBackend::try_new(0).is_ok() {
134            out.push(BackendKind::Cuda);
135        }
136
137        out
138    }
139
140    // ── SPH density benchmark ─────────────────────────────────────────────────
141
142    /// Benchmark SPH density summation for `n` particles on all available backends.
143    ///
144    /// Each particle's density is recomputed from scratch each call to avoid
145    /// caching effects.  FLOPs estimated as 10 × N² (distance + kernel eval).
146    pub fn bench_sph_density(&mut self, n: usize) -> Vec<GpuBenchReport> {
147        let cfg = SphConfig {
148            n_particles: n,
149            smoothing_h: 0.1,
150            rest_density: 1000.0,
151            gravity: 0.0, // no gravity — pure density bench
152            domain_min: [-10.; 3],
153            domain_max: [10.; 3],
154            ..SphConfig::default()
155        };
156
157        let mut out = Vec::new();
158
159        // ── CPU path ──────────────────────────────────────────────────────────
160        {
161            // Build a CPU-only sim (no GPU backend will be tried)
162            let mut sim = SphSimulation::new(cfg.clone());
163            // Scatter particles in a regular grid
164            let side = (n as f64).cbrt().ceil() as usize + 1;
165            for (idx, i) in (0..n).enumerate() {
166                let x = (idx % side) as f64 * 0.1 - 5.0;
167                let y = ((idx / side) % side) as f64 * 0.1;
168                let z = (idx / (side * side)) as f64 * 0.1;
169                sim.state.pos_x[i] = x;
170                sim.state.pos_y[i] = y;
171                sim.state.pos_z[i] = z;
172            }
173
174            // Warm-up
175            for _ in 0..self.warmup {
176                sim.step(1.0 / 60.0);
177            }
178
179            let t0 = Instant::now();
180            for _ in 0..self.iterations {
181                sim.step(1.0 / 60.0);
182            }
183            let total = t0.elapsed();
184
185            let flops = 10.0 * n as f64 * n as f64;
186            let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
187
188            let r = GpuBenchReport {
189                name: "sph_density".to_string(),
190                backend: BackendKind::Cpu,
191                n,
192                iterations: self.iterations,
193                total,
194                mean: total / self.iterations,
195                mflops: Some(mflops),
196            };
197            out.push(r.clone());
198            self.reports.push(r);
199        }
200
201        // ── wgpu path (if available) ──────────────────────────────────────────
202        if crate::compute::WgpuBackend::try_new().is_ok() {
203            let mut sim = SphSimulation::new(cfg.clone());
204            let side = (n as f64).cbrt().ceil() as usize + 1;
205            for (idx, i) in (0..n).enumerate() {
206                sim.state.pos_x[i] = (idx % side) as f64 * 0.1 - 5.0;
207                sim.state.pos_y[i] = ((idx / side) % side) as f64 * 0.1;
208                sim.state.pos_z[i] = (idx / (side * side)) as f64 * 0.1;
209            }
210
211            for _ in 0..self.warmup {
212                sim.step(1.0 / 60.0);
213            }
214            let t0 = Instant::now();
215            for _ in 0..self.iterations {
216                sim.step(1.0 / 60.0);
217            }
218            let total = t0.elapsed();
219
220            let backend = if sim.has_gpu() {
221                BackendKind::Wgpu
222            } else {
223                BackendKind::Cpu
224            };
225            let flops = 10.0 * n as f64 * n as f64;
226            let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
227
228            let r = GpuBenchReport {
229                name: "sph_density".to_string(),
230                backend,
231                n,
232                iterations: self.iterations,
233                total,
234                mean: total / self.iterations,
235                mflops: Some(mflops),
236            };
237            out.push(r.clone());
238            self.reports.push(r);
239        }
240
241        out
242    }
243
244    // ── LBM benchmark ─────────────────────────────────────────────────────────
245
246    /// Benchmark one LBM BGK step on an `nx × ny × nz` domain.
247    ///
248    /// FLOPs estimated as 120 × nc (19 distribution reads + BGK + streaming).
249    pub fn bench_lbm_step(&mut self, nx: usize, ny: usize, nz: usize) -> Vec<GpuBenchReport> {
250        let cfg = LbmConfig {
251            nx,
252            ny,
253            nz,
254            tau: 0.6,
255            rho0: 1.0,
256            force_x: 0.0,
257            force_y: 0.0,
258            force_z: 0.0,
259        };
260        let nc = nx * ny * nz;
261        let mut out = Vec::new();
262
263        // CPU path
264        {
265            let mut sim = LbmSimulation::new(cfg.clone());
266            sim.set_lid_velocity(0.1, 0.0, 0.0);
267
268            for _ in 0..self.warmup {
269                sim.step();
270            }
271            let t0 = Instant::now();
272            for _ in 0..self.iterations {
273                sim.step();
274            }
275            let total = t0.elapsed();
276
277            let flops = 120.0 * nc as f64;
278            let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
279
280            let r = GpuBenchReport {
281                name: format!("lbm_bgk_{}x{}x{}", nx, ny, nz),
282                backend: BackendKind::Cpu,
283                n: nc,
284                iterations: self.iterations,
285                total,
286                mean: total / self.iterations,
287                mflops: Some(mflops),
288            };
289            out.push(r.clone());
290            self.reports.push(r);
291        }
292
293        out
294    }
295
296    // ── Particle scan benchmark ───────────────────────────────────────────────
297
298    /// Benchmark parallel prefix scan on `n` f64 elements (CPU Rayon scan).
299    ///
300    /// FLOPs = 2n (N adds in up-sweep + N adds in down-sweep).
301    pub fn bench_parallel_scan(&mut self, n: usize) -> GpuBenchReport {
302        let data: Vec<f64> = (0..n).map(|i| i as f64 + 1.0).collect();
303
304        for _ in 0..self.warmup {
305            let _ = inclusive_scan_cpu(&data);
306        }
307        let t0 = Instant::now();
308        let mut result = Vec::new();
309        for _ in 0..self.iterations {
310            result = inclusive_scan_cpu(&data);
311        }
312        let total = t0.elapsed();
313        let _ = result;
314
315        let flops = 2.0 * n as f64;
316        let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
317
318        let r = GpuBenchReport {
319            name: "parallel_scan".to_string(),
320            backend: BackendKind::Cpu,
321            n,
322            iterations: self.iterations,
323            total,
324            mean: total / self.iterations,
325            mflops: Some(mflops),
326        };
327        self.reports.push(r.clone());
328        r
329    }
330
331    // ── Full suite ────────────────────────────────────────────────────────────
332
333    /// Run the complete GPU benchmark suite and return a formatted summary.
334    ///
335    /// ```
336    /// use oxiphysics_gpu::gpu_bench::GpuBenchHarness;
337    /// let mut h = GpuBenchHarness::new();
338    /// let summary = h.run_full_suite();
339    /// assert!(!summary.is_empty());
340    /// ```
341    pub fn run_full_suite(&mut self) -> String {
342        self.bench_sph_density(64);
343        self.bench_sph_density(256);
344        self.bench_lbm_step(8, 8, 8);
345        self.bench_lbm_step(16, 16, 4);
346        self.bench_parallel_scan(1024);
347        self.bench_parallel_scan(65536);
348
349        let mut out = format!("{} benchmarks\n", self.reports.len());
350        for r in &self.reports {
351            out.push_str(&format!("  {}\n", r));
352        }
353        out
354    }
355
356    /// Benchmark CPU inclusive scan vs wgpu copy dispatch for `n` f64 elements.
357    ///
358    /// Both sides operate on the same data (a ramp of 0.0..n).  The wgpu side
359    /// dispatches a copy shader (since f32 on-device means scan parity is a
360    /// different test).  Returns a `Vec` with one CPU report, and optionally one
361    /// wgpu report if an adapter is available.
362    ///
363    /// If no GPU adapter is present, only the CPU report is returned (no panic).
364    ///
365    /// ```
366    /// use oxiphysics_gpu::gpu_bench::GpuBenchHarness;
367    /// let mut h = GpuBenchHarness::new();
368    /// let reports = h.cpu_vs_wgpu_comparison(1000);
369    /// assert!(!reports.is_empty());
370    /// assert_eq!(reports[0].name, "cpu_copy_scan");
371    /// ```
372    pub fn cpu_vs_wgpu_comparison(&mut self, n: usize) -> Vec<GpuBenchReport> {
373        let mut out = Vec::new();
374
375        // ── CPU path: inclusive scan ──────────────────────────────────────────
376        let data: Vec<f64> = (0..n).map(|i| i as f64).collect();
377        for _ in 0..self.warmup {
378            let _ = inclusive_scan_cpu(&data);
379        }
380        let t0 = std::time::Instant::now();
381        for _ in 0..self.iterations {
382            let _ = inclusive_scan_cpu(&data);
383        }
384        let total_cpu = t0.elapsed();
385        let mean_cpu = total_cpu / self.iterations;
386        let flops = 2.0 * n as f64;
387        let mflops_cpu = flops / (total_cpu.as_secs_f64() / self.iterations as f64) / 1e6;
388
389        let cpu_report = GpuBenchReport {
390            name: "cpu_copy_scan".to_string(),
391            backend: BackendKind::Cpu,
392            n,
393            iterations: self.iterations,
394            total: total_cpu,
395            mean: mean_cpu,
396            mflops: Some(mflops_cpu),
397        };
398        out.push(cpu_report.clone());
399        self.reports.push(cpu_report);
400
401        // ── wgpu path (feature-gated) ─────────────────────────────────────────
402        #[cfg(feature = "wgpu-backend")]
403        {
404            use crate::compute::wgpu_backend::real::WgpuBackendReal;
405
406            let backend_result = WgpuBackendReal::try_new();
407            if let Ok(mut backend) = backend_result {
408                const COPY_WGSL: &str = r#"
409@group(0) @binding(0) var<storage, read>       in_buf:  array<f32>;
410@group(0) @binding(1) var<storage, read_write> out_buf: array<f32>;
411
412@compute @workgroup_size(64)
413fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
414    let i = gid.x;
415    if (i < arrayLength(&in_buf)) {
416        out_buf[i] = in_buf[i];
417    }
418}
419"#;
420                let in_buf = backend.create_buffer_f64(n);
421                let out_buf = backend.create_buffer_f64(n);
422                backend.write_buffer_f64(in_buf, &data);
423
424                let workgroups = WgpuBackendReal::dispatch_count_for(n, 64);
425
426                // Warm-up
427                for _ in 0..self.warmup {
428                    let _ = backend.dispatch_wgsl(
429                        COPY_WGSL,
430                        "main",
431                        &[
432                            (in_buf, wgpu::BufferBindingType::Storage { read_only: true }),
433                            (
434                                out_buf,
435                                wgpu::BufferBindingType::Storage { read_only: false },
436                            ),
437                        ],
438                        workgroups,
439                    );
440                }
441
442                let t0 = std::time::Instant::now();
443                for _ in 0..self.iterations {
444                    let _ = backend.dispatch_wgsl(
445                        COPY_WGSL,
446                        "main",
447                        &[
448                            (in_buf, wgpu::BufferBindingType::Storage { read_only: true }),
449                            (
450                                out_buf,
451                                wgpu::BufferBindingType::Storage { read_only: false },
452                            ),
453                        ],
454                        workgroups,
455                    );
456                }
457                let total_wgpu = t0.elapsed();
458                let mean_wgpu = total_wgpu / self.iterations;
459                let mflops_wgpu = flops / (total_wgpu.as_secs_f64() / self.iterations as f64) / 1e6;
460
461                let wgpu_report = GpuBenchReport {
462                    name: "wgpu_copy_dispatch".to_string(),
463                    backend: BackendKind::Wgpu,
464                    n,
465                    iterations: self.iterations,
466                    total: total_wgpu,
467                    mean: mean_wgpu,
468                    mflops: Some(mflops_wgpu),
469                };
470                out.push(wgpu_report.clone());
471                self.reports.push(wgpu_report);
472            }
473        }
474
475        out
476    }
477
478    /// Benchmark the SPH density kernel on CPU and wgpu backends side-by-side.
479    ///
480    /// Builds an SPH simulation with `n` particles arranged in a uniform grid
481    /// inside the domain `[-10, 10]³`. Runs the full `SphSimulation::step`
482    /// (density + pressure + accel + integrate) on both backends and returns
483    /// timing reports.
484    ///
485    /// If no wgpu adapter is available, only the CPU report is returned.
486    ///
487    /// # Example
488    /// ```
489    /// let mut h = oxiphysics_gpu::gpu_bench::GpuBenchHarness::new();
490    /// let reports = h.cpu_vs_wgpu_sph(64);
491    /// assert!(!reports.is_empty());
492    /// ```
493    pub fn cpu_vs_wgpu_sph(&mut self, n: usize) -> Vec<GpuBenchReport> {
494        let cfg = SphConfig {
495            n_particles: n,
496            smoothing_h: 0.1,
497            rest_density: 1000.0,
498            gravity: 0.0,
499            domain_min: [-10.; 3],
500            domain_max: [10.; 3],
501            ..SphConfig::default()
502        };
503
504        let mut out = Vec::new();
505
506        // ── CPU path ──────────────────────────────────────────────────────────
507        {
508            let mut sim = SphSimulation::new(cfg.clone());
509            let side = (n as f64).cbrt().ceil() as usize + 1;
510            for (idx, i) in (0..n).enumerate() {
511                let x = (idx % side) as f64 * 0.1 - 5.0;
512                let y = ((idx / side) % side) as f64 * 0.1;
513                let z = (idx / (side * side)) as f64 * 0.1;
514                sim.state.pos_x[i] = x;
515                sim.state.pos_y[i] = y;
516                sim.state.pos_z[i] = z;
517            }
518            for _ in 0..self.warmup {
519                sim.step(1.0 / 60.0);
520            }
521            let t0 = Instant::now();
522            for _ in 0..self.iterations {
523                sim.step(1.0 / 60.0);
524            }
525            let total = t0.elapsed();
526
527            let r = GpuBenchReport {
528                name: "sph_density_cpu".to_string(),
529                backend: BackendKind::Cpu,
530                n,
531                iterations: self.iterations,
532                total,
533                mean: total / self.iterations,
534                mflops: None,
535            };
536            out.push(r.clone());
537            self.reports.push(r);
538        }
539
540        // ── wgpu path (runtime-gated) ─────────────────────────────────────────
541        if crate::compute::WgpuBackend::try_new().is_ok() {
542            let mut sim = SphSimulation::new(cfg.clone());
543            let side = (n as f64).cbrt().ceil() as usize + 1;
544            for (idx, i) in (0..n).enumerate() {
545                sim.state.pos_x[i] = (idx % side) as f64 * 0.1 - 5.0;
546                sim.state.pos_y[i] = ((idx / side) % side) as f64 * 0.1;
547                sim.state.pos_z[i] = (idx / (side * side)) as f64 * 0.1;
548            }
549
550            for _ in 0..self.warmup {
551                sim.step(1.0 / 60.0);
552            }
553            let t0 = Instant::now();
554            for _ in 0..self.iterations {
555                sim.step(1.0 / 60.0);
556            }
557            let total = t0.elapsed();
558
559            let backend = if sim.has_gpu() {
560                BackendKind::Wgpu
561            } else {
562                BackendKind::Cpu
563            };
564
565            let r = GpuBenchReport {
566                name: "sph_density_wgpu".to_string(),
567                backend,
568                n,
569                iterations: self.iterations,
570                total,
571                mean: total / self.iterations,
572                mflops: None,
573            };
574            out.push(r.clone());
575            self.reports.push(r);
576        }
577
578        out
579    }
580
581    /// Run SPH density on CPU and (optionally) CUDA; return timing reports.
582    ///
583    /// The CPU path runs the same `SphSimulation::step` loop as
584    /// [`Self::cpu_vs_wgpu_sph`] but is tagged with `"cuda_sph_density_cpu"`.
585    ///
586    /// Under the `cuda-backend` feature, a second report is added when a CUDA
587    /// device is available at runtime.  If no CUDA driver is present (e.g. on
588    /// macOS) only the CPU report is returned — no panic.
589    ///
590    /// # Example
591    /// ```
592    /// let mut h = oxiphysics_gpu::gpu_bench::GpuBenchHarness::new();
593    /// let reports = h.cpu_vs_cuda_sph(64);
594    /// assert!(!reports.is_empty());
595    /// assert!(reports[0].name.contains("sph_density"));
596    /// assert!(reports[0].mean > std::time::Duration::ZERO);
597    /// ```
598    pub fn cpu_vs_cuda_sph(&mut self, n: usize) -> Vec<GpuBenchReport> {
599        let cfg = crate::sph_gpu::SphConfig {
600            n_particles: n,
601            smoothing_h: 0.1,
602            rest_density: 1000.0,
603            gravity: 0.0,
604            domain_min: [-10.; 3],
605            domain_max: [10.; 3],
606            ..crate::sph_gpu::SphConfig::default()
607        };
608        let mut out = Vec::new();
609
610        // ── CPU path ──────────────────────────────────────────────────────────
611        {
612            let mut sim = crate::sph_gpu::SphSimulation::new(cfg.clone());
613            let side = (n as f64).cbrt().ceil() as usize + 1;
614            for idx in 0..n {
615                let x = (idx % side) as f64 * 0.1 - 5.0;
616                let y = ((idx / side) % side) as f64 * 0.1;
617                let z = (idx / (side * side)) as f64 * 0.1;
618                sim.state.pos_x[idx] = x;
619                sim.state.pos_y[idx] = y;
620                sim.state.pos_z[idx] = z;
621            }
622            for _ in 0..self.warmup {
623                sim.step(1.0 / 60.0);
624            }
625            let t0 = Instant::now();
626            for _ in 0..self.iterations {
627                sim.step(1.0 / 60.0);
628            }
629            let total = t0.elapsed();
630
631            let r = GpuBenchReport {
632                name: "cuda_sph_density_cpu".to_string(),
633                backend: BackendKind::Cpu,
634                n,
635                iterations: self.iterations,
636                total,
637                mean: total / self.iterations,
638                mflops: None,
639            };
640            out.push(r.clone());
641            self.reports.push(r);
642        }
643
644        // ── CUDA path (feature + runtime gated) ───────────────────────────────
645        #[cfg(feature = "cuda-backend")]
646        {
647            use crate::compute::cuda_backend::{CUDA_SPH_DENSITY_SRC, CudaBackend};
648
649            if let Ok(mut backend) = CudaBackend::try_new(0) {
650                // Compile and register the SPH density kernel.
651                let compiled =
652                    backend.compile_and_register("sph_density_kernel", CUDA_SPH_DENSITY_SRC);
653                if compiled.is_ok() {
654                    // Build position buffer (n × 3 doubles, interleaved xyz).
655                    let side = (n as f64).cbrt().ceil() as usize + 1;
656                    let mut positions = vec![0.0_f64; n * 3];
657                    for idx in 0..n {
658                        positions[3 * idx] = (idx % side) as f64 * 0.1 - 5.0;
659                        positions[3 * idx + 1] = ((idx / side) % side) as f64 * 0.1;
660                        positions[3 * idx + 2] = (idx / (side * side)) as f64 * 0.1;
661                    }
662
663                    let pos_buf = backend.create_buffer(n * 3);
664                    let den_buf = backend.create_buffer(n);
665                    backend.write_buffer(pos_buf, &positions);
666
667                    let block_x: u32 = 256;
668                    let grid_x = (n as u32).div_ceil(block_x);
669
670                    // The kernel signature is:
671                    //   sph_density_kernel(const double*, double*, int, double, double)
672                    // so we forward (n_particles, smoothing_h, particle_mass)
673                    // as scalar arguments after the two buffer arguments.
674                    let n_i32 = [n as i32];
675                    let scalars_f64 = [
676                        cfg.smoothing_h,
677                        // Use a unit particle mass if the config left it as
678                        // zero (cpu_vs_cuda_sph does not run SphSimulation::new
679                        // for the GPU path, so the default 0.0 mass is fine to
680                        // override for a smoke benchmark).
681                        if cfg.particle_mass > 0.0 {
682                            cfg.particle_mass
683                        } else {
684                            1.0
685                        },
686                    ];
687
688                    // Warm-up
689                    for _ in 0..self.warmup {
690                        backend.launch_with_scalars(
691                            "sph_density_kernel",
692                            &[pos_buf, den_buf],
693                            &n_i32,
694                            &scalars_f64,
695                            grid_x,
696                            block_x,
697                        );
698                        backend.synchronize();
699                    }
700
701                    let t0 = Instant::now();
702                    for _ in 0..self.iterations {
703                        backend.launch_with_scalars(
704                            "sph_density_kernel",
705                            &[pos_buf, den_buf],
706                            &n_i32,
707                            &scalars_f64,
708                            grid_x,
709                            block_x,
710                        );
711                        backend.synchronize();
712                    }
713                    let total = t0.elapsed();
714
715                    let r = GpuBenchReport {
716                        name: "cuda_sph_density_gpu".to_string(),
717                        backend: BackendKind::Cuda,
718                        n,
719                        iterations: self.iterations,
720                        total,
721                        mean: total / self.iterations,
722                        mflops: None,
723                    };
724                    out.push(r.clone());
725                    self.reports.push(r);
726                }
727            }
728        }
729
730        out
731    }
732
733    /// Print a comparison table for all collected reports.
734    pub fn print_comparison(&self) {
735        println!("\n{:=<75}", "");
736        println!(
737            "{:<5} {:<22} {:>8} {:>12} {:>10}",
738            "Back", "Kernel", "N", "Mean (µs)", "MFLOPs"
739        );
740        println!("{:=<75}", "");
741        for r in &self.reports {
742            let mf = r.mflops.map_or("—".to_string(), |m| format!("{:.1}", m));
743            println!(
744                "{:<5} {:<22} {:>8} {:>12.3} {:>10}",
745                r.backend,
746                r.name,
747                r.n,
748                r.mean.as_secs_f64() * 1e6,
749                mf
750            );
751        }
752        println!("{:=<75}", "");
753    }
754}
755
756impl Default for GpuBenchHarness {
757    fn default() -> Self {
758        Self::new()
759    }
760}
761
762// ── SpeedupReport ─────────────────────────────────────────────────────────────
763
764/// Computed speedup between a CPU and a wgpu benchmark report.
765#[derive(Debug, Clone)]
766pub struct SpeedupReport {
767    /// Mean time for the baseline (CPU) backend.
768    pub cpu_mean: Duration,
769    /// Mean time for the accelerated (wgpu) backend, if available.
770    pub wgpu_mean: Option<Duration>,
771    /// Speedup ratio = cpu_mean / wgpu_mean, if wgpu was measured.
772    pub speedup: Option<f64>,
773}
774
775/// Compute a speedup ratio from a pair of bench reports.
776///
777/// Expects `reports[0]` to be the CPU report and `reports[1]` (if present)
778/// to be the wgpu report.  Returns `SpeedupReport { speedup: None }` if
779/// only one report is present (GPU unavailable).
780///
781/// # Example
782/// ```
783/// use oxiphysics_gpu::gpu_bench::{GpuBenchHarness, compute_speedup};
784/// let mut h = GpuBenchHarness::new();
785/// let reports = h.cpu_vs_wgpu_sph(64);
786/// let sr = compute_speedup(&reports);
787/// assert!(sr.cpu_mean.as_secs_f64() > 0.0);
788/// ```
789pub fn compute_speedup(reports: &[GpuBenchReport]) -> SpeedupReport {
790    let cpu_mean = reports.first().map(|r| r.mean).unwrap_or(Duration::ZERO);
791    let wgpu_mean = reports.get(1).map(|r| r.mean);
792    let speedup = wgpu_mean.map(|wm| {
793        if wm.as_secs_f64() > 0.0 {
794            cpu_mean.as_secs_f64() / wm.as_secs_f64()
795        } else {
796            f64::INFINITY
797        }
798    });
799    SpeedupReport {
800        cpu_mean,
801        wgpu_mean,
802        speedup,
803    }
804}
805
806// ── CudaSpeedupReport ─────────────────────────────────────────────────────────
807
808/// Computed speedup between a CPU and a CUDA benchmark report.
809#[derive(Debug, Clone)]
810pub struct CudaSpeedupReport {
811    /// Mean time for the baseline (CPU) backend.
812    pub cpu_mean: Duration,
813    /// Mean time for the CUDA backend, if available.
814    pub cuda_mean: Option<Duration>,
815    /// Speedup ratio = cpu_mean / cuda_mean, if CUDA was measured.
816    pub speedup: Option<f64>,
817}
818
819/// Compute a speedup ratio from a pair of bench reports produced by
820/// [`GpuBenchHarness::cpu_vs_cuda_sph`].
821///
822/// Expects `reports[0]` to be the CPU report and `reports[1]` (if present)
823/// to be the CUDA report.  Returns `CudaSpeedupReport { speedup: None }` if
824/// only one report is present (CUDA unavailable).
825///
826/// # Example
827/// ```
828/// use oxiphysics_gpu::gpu_bench::{GpuBenchHarness, compute_cuda_speedup};
829/// let mut h = GpuBenchHarness::new();
830/// let reports = h.cpu_vs_cuda_sph(64);
831/// let sr = compute_cuda_speedup(&reports);
832/// assert!(sr.cpu_mean.as_secs_f64() >= 0.0);
833/// ```
834pub fn compute_cuda_speedup(reports: &[GpuBenchReport]) -> CudaSpeedupReport {
835    let cpu_mean = reports.first().map(|r| r.mean).unwrap_or(Duration::ZERO);
836    let cuda_mean = reports.get(1).map(|r| r.mean);
837    let speedup = cuda_mean.map(|cm| {
838        if cm.as_secs_f64() > 0.0 {
839            cpu_mean.as_secs_f64() / cm.as_secs_f64()
840        } else {
841            f64::INFINITY
842        }
843    });
844    CudaSpeedupReport {
845        cpu_mean,
846        cuda_mean,
847        speedup,
848    }
849}
850
851// ── CPU helpers ───────────────────────────────────────────────────────────────
852
853/// Sequential inclusive prefix scan (Σ) on `f64` elements.
854///
855/// Returns a `Vec<f64>` where `out[i] = Σ_{j≤i} data[j]`.
856pub fn inclusive_scan_cpu(data: &[f64]) -> Vec<f64> {
857    let mut out = Vec::with_capacity(data.len());
858    let mut acc = 0.0_f64;
859    for &v in data {
860        acc += v;
861        out.push(acc);
862    }
863    out
864}
865
866// ── tests ─────────────────────────────────────────────────────────────────────
867
868#[cfg(test)]
869mod tests {
870    use super::*;
871
872    #[test]
873    fn test_available_backends_has_cpu() {
874        let b = GpuBenchHarness::available_backends();
875        assert!(b.contains(&BackendKind::Cpu));
876    }
877
878    #[test]
879    fn test_inclusive_scan() {
880        let data = vec![1.0, 2.0, 3.0, 4.0];
881        let out = inclusive_scan_cpu(&data);
882        assert_eq!(out, vec![1.0, 3.0, 6.0, 10.0]);
883    }
884
885    #[test]
886    fn test_bench_sph_density_returns_at_least_cpu() {
887        let mut h = GpuBenchHarness {
888            warmup: 0,
889            iterations: 1,
890            reports: Vec::new(),
891        };
892        let reports = h.bench_sph_density(8);
893        assert!(!reports.is_empty());
894        assert_eq!(reports[0].backend, BackendKind::Cpu);
895    }
896
897    #[test]
898    fn test_bench_lbm_step() {
899        let mut h = GpuBenchHarness {
900            warmup: 0,
901            iterations: 1,
902            reports: Vec::new(),
903        };
904        let reports = h.bench_lbm_step(4, 4, 4);
905        assert_eq!(reports.len(), 1);
906        assert_eq!(reports[0].n, 64);
907    }
908
909    #[test]
910    fn test_bench_parallel_scan() {
911        let mut h = GpuBenchHarness {
912            warmup: 0,
913            iterations: 1,
914            reports: Vec::new(),
915        };
916        let r = h.bench_parallel_scan(100);
917        assert_eq!(r.n, 100);
918        assert!(r.mflops.is_some());
919    }
920
921    #[test]
922    fn test_run_full_suite() {
923        let mut h = GpuBenchHarness {
924            warmup: 0,
925            iterations: 1,
926            reports: Vec::new(),
927        };
928        let summary = h.run_full_suite();
929        assert!(summary.contains("benchmarks"));
930    }
931
932    #[test]
933    fn test_backend_display() {
934        assert_eq!(format!("{}", BackendKind::Cpu), "CPU");
935        assert_eq!(format!("{}", BackendKind::Wgpu), "wgpu");
936        assert_eq!(format!("{}", BackendKind::Cuda), "CUDA");
937    }
938}