1#![allow(dead_code)]
29#![allow(clippy::too_many_arguments)]
30
31use std::time::{Duration, Instant};
32
33use crate::lbm_gpu::{LbmConfig, LbmSimulation};
34use crate::sph_gpu::{SphConfig, SphSimulation};
35
36#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
40pub enum BackendKind {
41 Cpu,
43 Wgpu,
45 Cuda,
47}
48
49impl std::fmt::Display for BackendKind {
50 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
51 match self {
52 Self::Cpu => write!(f, "CPU"),
53 Self::Wgpu => write!(f, "wgpu"),
54 Self::Cuda => write!(f, "CUDA"),
55 }
56 }
57}
58
59#[derive(Debug, Clone)]
63pub struct GpuBenchReport {
64 pub name: String,
66 pub backend: BackendKind,
68 pub n: usize,
70 pub iterations: u32,
72 pub total: Duration,
74 pub mean: Duration,
76 pub mflops: Option<f64>,
78}
79
80impl std::fmt::Display for GpuBenchReport {
81 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
82 write!(
83 f,
84 "[{:<5} {:>20}] n={:>6} mean={:.3}µs",
85 self.backend,
86 self.name,
87 self.n,
88 self.mean.as_secs_f64() * 1e6
89 )?;
90 if let Some(mf) = self.mflops {
91 write!(f, " {:.1} MFLOPs", mf)?;
92 }
93 Ok(())
94 }
95}
96
97pub struct GpuBenchHarness {
101 pub warmup: u32,
103 pub iterations: u32,
105 pub reports: Vec<GpuBenchReport>,
107}
108
109impl GpuBenchHarness {
110 pub fn new() -> Self {
112 Self {
113 warmup: 2,
114 iterations: 5,
115 reports: Vec::new(),
116 }
117 }
118
119 pub fn available_backends() -> Vec<BackendKind> {
125 let mut out = vec![BackendKind::Cpu];
126
127 if crate::compute::WgpuBackend::try_new().is_ok() {
129 out.push(BackendKind::Wgpu);
130 }
131
132 if crate::compute::cuda_backend::CudaBackend::try_new(0).is_ok() {
134 out.push(BackendKind::Cuda);
135 }
136
137 out
138 }
139
140 pub fn bench_sph_density(&mut self, n: usize) -> Vec<GpuBenchReport> {
147 let cfg = SphConfig {
148 n_particles: n,
149 smoothing_h: 0.1,
150 rest_density: 1000.0,
151 gravity: 0.0, domain_min: [-10.; 3],
153 domain_max: [10.; 3],
154 ..SphConfig::default()
155 };
156
157 let mut out = Vec::new();
158
159 {
161 let mut sim = SphSimulation::new(cfg.clone());
163 let side = (n as f64).cbrt().ceil() as usize + 1;
165 for (idx, i) in (0..n).enumerate() {
166 let x = (idx % side) as f64 * 0.1 - 5.0;
167 let y = ((idx / side) % side) as f64 * 0.1;
168 let z = (idx / (side * side)) as f64 * 0.1;
169 sim.state.pos_x[i] = x;
170 sim.state.pos_y[i] = y;
171 sim.state.pos_z[i] = z;
172 }
173
174 for _ in 0..self.warmup {
176 sim.step(1.0 / 60.0);
177 }
178
179 let t0 = Instant::now();
180 for _ in 0..self.iterations {
181 sim.step(1.0 / 60.0);
182 }
183 let total = t0.elapsed();
184
185 let flops = 10.0 * n as f64 * n as f64;
186 let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
187
188 let r = GpuBenchReport {
189 name: "sph_density".to_string(),
190 backend: BackendKind::Cpu,
191 n,
192 iterations: self.iterations,
193 total,
194 mean: total / self.iterations,
195 mflops: Some(mflops),
196 };
197 out.push(r.clone());
198 self.reports.push(r);
199 }
200
201 if crate::compute::WgpuBackend::try_new().is_ok() {
203 let mut sim = SphSimulation::new(cfg.clone());
204 let side = (n as f64).cbrt().ceil() as usize + 1;
205 for (idx, i) in (0..n).enumerate() {
206 sim.state.pos_x[i] = (idx % side) as f64 * 0.1 - 5.0;
207 sim.state.pos_y[i] = ((idx / side) % side) as f64 * 0.1;
208 sim.state.pos_z[i] = (idx / (side * side)) as f64 * 0.1;
209 }
210
211 for _ in 0..self.warmup {
212 sim.step(1.0 / 60.0);
213 }
214 let t0 = Instant::now();
215 for _ in 0..self.iterations {
216 sim.step(1.0 / 60.0);
217 }
218 let total = t0.elapsed();
219
220 let backend = if sim.has_gpu() {
221 BackendKind::Wgpu
222 } else {
223 BackendKind::Cpu
224 };
225 let flops = 10.0 * n as f64 * n as f64;
226 let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
227
228 let r = GpuBenchReport {
229 name: "sph_density".to_string(),
230 backend,
231 n,
232 iterations: self.iterations,
233 total,
234 mean: total / self.iterations,
235 mflops: Some(mflops),
236 };
237 out.push(r.clone());
238 self.reports.push(r);
239 }
240
241 out
242 }
243
244 pub fn bench_lbm_step(&mut self, nx: usize, ny: usize, nz: usize) -> Vec<GpuBenchReport> {
250 let cfg = LbmConfig {
251 nx,
252 ny,
253 nz,
254 tau: 0.6,
255 rho0: 1.0,
256 force_x: 0.0,
257 force_y: 0.0,
258 force_z: 0.0,
259 };
260 let nc = nx * ny * nz;
261 let mut out = Vec::new();
262
263 {
265 let mut sim = LbmSimulation::new(cfg.clone());
266 sim.set_lid_velocity(0.1, 0.0, 0.0);
267
268 for _ in 0..self.warmup {
269 sim.step();
270 }
271 let t0 = Instant::now();
272 for _ in 0..self.iterations {
273 sim.step();
274 }
275 let total = t0.elapsed();
276
277 let flops = 120.0 * nc as f64;
278 let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
279
280 let r = GpuBenchReport {
281 name: format!("lbm_bgk_{}x{}x{}", nx, ny, nz),
282 backend: BackendKind::Cpu,
283 n: nc,
284 iterations: self.iterations,
285 total,
286 mean: total / self.iterations,
287 mflops: Some(mflops),
288 };
289 out.push(r.clone());
290 self.reports.push(r);
291 }
292
293 out
294 }
295
296 pub fn bench_parallel_scan(&mut self, n: usize) -> GpuBenchReport {
302 let data: Vec<f64> = (0..n).map(|i| i as f64 + 1.0).collect();
303
304 for _ in 0..self.warmup {
305 let _ = inclusive_scan_cpu(&data);
306 }
307 let t0 = Instant::now();
308 let mut result = Vec::new();
309 for _ in 0..self.iterations {
310 result = inclusive_scan_cpu(&data);
311 }
312 let total = t0.elapsed();
313 let _ = result;
314
315 let flops = 2.0 * n as f64;
316 let mflops = flops / (total.as_secs_f64() / self.iterations as f64) / 1e6;
317
318 let r = GpuBenchReport {
319 name: "parallel_scan".to_string(),
320 backend: BackendKind::Cpu,
321 n,
322 iterations: self.iterations,
323 total,
324 mean: total / self.iterations,
325 mflops: Some(mflops),
326 };
327 self.reports.push(r.clone());
328 r
329 }
330
331 pub fn run_full_suite(&mut self) -> String {
342 self.bench_sph_density(64);
343 self.bench_sph_density(256);
344 self.bench_lbm_step(8, 8, 8);
345 self.bench_lbm_step(16, 16, 4);
346 self.bench_parallel_scan(1024);
347 self.bench_parallel_scan(65536);
348
349 let mut out = format!("{} benchmarks\n", self.reports.len());
350 for r in &self.reports {
351 out.push_str(&format!(" {}\n", r));
352 }
353 out
354 }
355
356 pub fn cpu_vs_wgpu_comparison(&mut self, n: usize) -> Vec<GpuBenchReport> {
373 let mut out = Vec::new();
374
375 let data: Vec<f64> = (0..n).map(|i| i as f64).collect();
377 for _ in 0..self.warmup {
378 let _ = inclusive_scan_cpu(&data);
379 }
380 let t0 = std::time::Instant::now();
381 for _ in 0..self.iterations {
382 let _ = inclusive_scan_cpu(&data);
383 }
384 let total_cpu = t0.elapsed();
385 let mean_cpu = total_cpu / self.iterations;
386 let flops = 2.0 * n as f64;
387 let mflops_cpu = flops / (total_cpu.as_secs_f64() / self.iterations as f64) / 1e6;
388
389 let cpu_report = GpuBenchReport {
390 name: "cpu_copy_scan".to_string(),
391 backend: BackendKind::Cpu,
392 n,
393 iterations: self.iterations,
394 total: total_cpu,
395 mean: mean_cpu,
396 mflops: Some(mflops_cpu),
397 };
398 out.push(cpu_report.clone());
399 self.reports.push(cpu_report);
400
401 #[cfg(feature = "wgpu-backend")]
403 {
404 use crate::compute::wgpu_backend::real::WgpuBackendReal;
405
406 let backend_result = WgpuBackendReal::try_new();
407 if let Ok(mut backend) = backend_result {
408 const COPY_WGSL: &str = r#"
409@group(0) @binding(0) var<storage, read> in_buf: array<f32>;
410@group(0) @binding(1) var<storage, read_write> out_buf: array<f32>;
411
412@compute @workgroup_size(64)
413fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
414 let i = gid.x;
415 if (i < arrayLength(&in_buf)) {
416 out_buf[i] = in_buf[i];
417 }
418}
419"#;
420 let in_buf = backend.create_buffer_f64(n);
421 let out_buf = backend.create_buffer_f64(n);
422 backend.write_buffer_f64(in_buf, &data);
423
424 let workgroups = WgpuBackendReal::dispatch_count_for(n, 64);
425
426 for _ in 0..self.warmup {
428 let _ = backend.dispatch_wgsl(
429 COPY_WGSL,
430 "main",
431 &[
432 (in_buf, wgpu::BufferBindingType::Storage { read_only: true }),
433 (
434 out_buf,
435 wgpu::BufferBindingType::Storage { read_only: false },
436 ),
437 ],
438 workgroups,
439 );
440 }
441
442 let t0 = std::time::Instant::now();
443 for _ in 0..self.iterations {
444 let _ = backend.dispatch_wgsl(
445 COPY_WGSL,
446 "main",
447 &[
448 (in_buf, wgpu::BufferBindingType::Storage { read_only: true }),
449 (
450 out_buf,
451 wgpu::BufferBindingType::Storage { read_only: false },
452 ),
453 ],
454 workgroups,
455 );
456 }
457 let total_wgpu = t0.elapsed();
458 let mean_wgpu = total_wgpu / self.iterations;
459 let mflops_wgpu = flops / (total_wgpu.as_secs_f64() / self.iterations as f64) / 1e6;
460
461 let wgpu_report = GpuBenchReport {
462 name: "wgpu_copy_dispatch".to_string(),
463 backend: BackendKind::Wgpu,
464 n,
465 iterations: self.iterations,
466 total: total_wgpu,
467 mean: mean_wgpu,
468 mflops: Some(mflops_wgpu),
469 };
470 out.push(wgpu_report.clone());
471 self.reports.push(wgpu_report);
472 }
473 }
474
475 out
476 }
477
478 pub fn cpu_vs_wgpu_sph(&mut self, n: usize) -> Vec<GpuBenchReport> {
494 let cfg = SphConfig {
495 n_particles: n,
496 smoothing_h: 0.1,
497 rest_density: 1000.0,
498 gravity: 0.0,
499 domain_min: [-10.; 3],
500 domain_max: [10.; 3],
501 ..SphConfig::default()
502 };
503
504 let mut out = Vec::new();
505
506 {
508 let mut sim = SphSimulation::new(cfg.clone());
509 let side = (n as f64).cbrt().ceil() as usize + 1;
510 for (idx, i) in (0..n).enumerate() {
511 let x = (idx % side) as f64 * 0.1 - 5.0;
512 let y = ((idx / side) % side) as f64 * 0.1;
513 let z = (idx / (side * side)) as f64 * 0.1;
514 sim.state.pos_x[i] = x;
515 sim.state.pos_y[i] = y;
516 sim.state.pos_z[i] = z;
517 }
518 for _ in 0..self.warmup {
519 sim.step(1.0 / 60.0);
520 }
521 let t0 = Instant::now();
522 for _ in 0..self.iterations {
523 sim.step(1.0 / 60.0);
524 }
525 let total = t0.elapsed();
526
527 let r = GpuBenchReport {
528 name: "sph_density_cpu".to_string(),
529 backend: BackendKind::Cpu,
530 n,
531 iterations: self.iterations,
532 total,
533 mean: total / self.iterations,
534 mflops: None,
535 };
536 out.push(r.clone());
537 self.reports.push(r);
538 }
539
540 if crate::compute::WgpuBackend::try_new().is_ok() {
542 let mut sim = SphSimulation::new(cfg.clone());
543 let side = (n as f64).cbrt().ceil() as usize + 1;
544 for (idx, i) in (0..n).enumerate() {
545 sim.state.pos_x[i] = (idx % side) as f64 * 0.1 - 5.0;
546 sim.state.pos_y[i] = ((idx / side) % side) as f64 * 0.1;
547 sim.state.pos_z[i] = (idx / (side * side)) as f64 * 0.1;
548 }
549
550 for _ in 0..self.warmup {
551 sim.step(1.0 / 60.0);
552 }
553 let t0 = Instant::now();
554 for _ in 0..self.iterations {
555 sim.step(1.0 / 60.0);
556 }
557 let total = t0.elapsed();
558
559 let backend = if sim.has_gpu() {
560 BackendKind::Wgpu
561 } else {
562 BackendKind::Cpu
563 };
564
565 let r = GpuBenchReport {
566 name: "sph_density_wgpu".to_string(),
567 backend,
568 n,
569 iterations: self.iterations,
570 total,
571 mean: total / self.iterations,
572 mflops: None,
573 };
574 out.push(r.clone());
575 self.reports.push(r);
576 }
577
578 out
579 }
580
581 pub fn cpu_vs_cuda_sph(&mut self, n: usize) -> Vec<GpuBenchReport> {
599 let cfg = crate::sph_gpu::SphConfig {
600 n_particles: n,
601 smoothing_h: 0.1,
602 rest_density: 1000.0,
603 gravity: 0.0,
604 domain_min: [-10.; 3],
605 domain_max: [10.; 3],
606 ..crate::sph_gpu::SphConfig::default()
607 };
608 let mut out = Vec::new();
609
610 {
612 let mut sim = crate::sph_gpu::SphSimulation::new(cfg.clone());
613 let side = (n as f64).cbrt().ceil() as usize + 1;
614 for idx in 0..n {
615 let x = (idx % side) as f64 * 0.1 - 5.0;
616 let y = ((idx / side) % side) as f64 * 0.1;
617 let z = (idx / (side * side)) as f64 * 0.1;
618 sim.state.pos_x[idx] = x;
619 sim.state.pos_y[idx] = y;
620 sim.state.pos_z[idx] = z;
621 }
622 for _ in 0..self.warmup {
623 sim.step(1.0 / 60.0);
624 }
625 let t0 = Instant::now();
626 for _ in 0..self.iterations {
627 sim.step(1.0 / 60.0);
628 }
629 let total = t0.elapsed();
630
631 let r = GpuBenchReport {
632 name: "cuda_sph_density_cpu".to_string(),
633 backend: BackendKind::Cpu,
634 n,
635 iterations: self.iterations,
636 total,
637 mean: total / self.iterations,
638 mflops: None,
639 };
640 out.push(r.clone());
641 self.reports.push(r);
642 }
643
644 #[cfg(feature = "cuda-backend")]
646 {
647 use crate::compute::cuda_backend::{CUDA_SPH_DENSITY_SRC, CudaBackend};
648
649 if let Ok(mut backend) = CudaBackend::try_new(0) {
650 let compiled =
652 backend.compile_and_register("sph_density_kernel", CUDA_SPH_DENSITY_SRC);
653 if compiled.is_ok() {
654 let side = (n as f64).cbrt().ceil() as usize + 1;
656 let mut positions = vec![0.0_f64; n * 3];
657 for idx in 0..n {
658 positions[3 * idx] = (idx % side) as f64 * 0.1 - 5.0;
659 positions[3 * idx + 1] = ((idx / side) % side) as f64 * 0.1;
660 positions[3 * idx + 2] = (idx / (side * side)) as f64 * 0.1;
661 }
662
663 let pos_buf = backend.create_buffer(n * 3);
664 let den_buf = backend.create_buffer(n);
665 backend.write_buffer(pos_buf, &positions);
666
667 let block_x: u32 = 256;
668 let grid_x = (n as u32).div_ceil(block_x);
669
670 let n_i32 = [n as i32];
675 let scalars_f64 = [
676 cfg.smoothing_h,
677 if cfg.particle_mass > 0.0 {
682 cfg.particle_mass
683 } else {
684 1.0
685 },
686 ];
687
688 for _ in 0..self.warmup {
690 backend.launch_with_scalars(
691 "sph_density_kernel",
692 &[pos_buf, den_buf],
693 &n_i32,
694 &scalars_f64,
695 grid_x,
696 block_x,
697 );
698 backend.synchronize();
699 }
700
701 let t0 = Instant::now();
702 for _ in 0..self.iterations {
703 backend.launch_with_scalars(
704 "sph_density_kernel",
705 &[pos_buf, den_buf],
706 &n_i32,
707 &scalars_f64,
708 grid_x,
709 block_x,
710 );
711 backend.synchronize();
712 }
713 let total = t0.elapsed();
714
715 let r = GpuBenchReport {
716 name: "cuda_sph_density_gpu".to_string(),
717 backend: BackendKind::Cuda,
718 n,
719 iterations: self.iterations,
720 total,
721 mean: total / self.iterations,
722 mflops: None,
723 };
724 out.push(r.clone());
725 self.reports.push(r);
726 }
727 }
728 }
729
730 out
731 }
732
733 pub fn print_comparison(&self) {
735 println!("\n{:=<75}", "");
736 println!(
737 "{:<5} {:<22} {:>8} {:>12} {:>10}",
738 "Back", "Kernel", "N", "Mean (µs)", "MFLOPs"
739 );
740 println!("{:=<75}", "");
741 for r in &self.reports {
742 let mf = r.mflops.map_or("—".to_string(), |m| format!("{:.1}", m));
743 println!(
744 "{:<5} {:<22} {:>8} {:>12.3} {:>10}",
745 r.backend,
746 r.name,
747 r.n,
748 r.mean.as_secs_f64() * 1e6,
749 mf
750 );
751 }
752 println!("{:=<75}", "");
753 }
754}
755
756impl Default for GpuBenchHarness {
757 fn default() -> Self {
758 Self::new()
759 }
760}
761
762#[derive(Debug, Clone)]
766pub struct SpeedupReport {
767 pub cpu_mean: Duration,
769 pub wgpu_mean: Option<Duration>,
771 pub speedup: Option<f64>,
773}
774
775pub fn compute_speedup(reports: &[GpuBenchReport]) -> SpeedupReport {
790 let cpu_mean = reports.first().map(|r| r.mean).unwrap_or(Duration::ZERO);
791 let wgpu_mean = reports.get(1).map(|r| r.mean);
792 let speedup = wgpu_mean.map(|wm| {
793 if wm.as_secs_f64() > 0.0 {
794 cpu_mean.as_secs_f64() / wm.as_secs_f64()
795 } else {
796 f64::INFINITY
797 }
798 });
799 SpeedupReport {
800 cpu_mean,
801 wgpu_mean,
802 speedup,
803 }
804}
805
806#[derive(Debug, Clone)]
810pub struct CudaSpeedupReport {
811 pub cpu_mean: Duration,
813 pub cuda_mean: Option<Duration>,
815 pub speedup: Option<f64>,
817}
818
819pub fn compute_cuda_speedup(reports: &[GpuBenchReport]) -> CudaSpeedupReport {
835 let cpu_mean = reports.first().map(|r| r.mean).unwrap_or(Duration::ZERO);
836 let cuda_mean = reports.get(1).map(|r| r.mean);
837 let speedup = cuda_mean.map(|cm| {
838 if cm.as_secs_f64() > 0.0 {
839 cpu_mean.as_secs_f64() / cm.as_secs_f64()
840 } else {
841 f64::INFINITY
842 }
843 });
844 CudaSpeedupReport {
845 cpu_mean,
846 cuda_mean,
847 speedup,
848 }
849}
850
851pub fn inclusive_scan_cpu(data: &[f64]) -> Vec<f64> {
857 let mut out = Vec::with_capacity(data.len());
858 let mut acc = 0.0_f64;
859 for &v in data {
860 acc += v;
861 out.push(acc);
862 }
863 out
864}
865
866#[cfg(test)]
869mod tests {
870 use super::*;
871
872 #[test]
873 fn test_available_backends_has_cpu() {
874 let b = GpuBenchHarness::available_backends();
875 assert!(b.contains(&BackendKind::Cpu));
876 }
877
878 #[test]
879 fn test_inclusive_scan() {
880 let data = vec![1.0, 2.0, 3.0, 4.0];
881 let out = inclusive_scan_cpu(&data);
882 assert_eq!(out, vec![1.0, 3.0, 6.0, 10.0]);
883 }
884
885 #[test]
886 fn test_bench_sph_density_returns_at_least_cpu() {
887 let mut h = GpuBenchHarness {
888 warmup: 0,
889 iterations: 1,
890 reports: Vec::new(),
891 };
892 let reports = h.bench_sph_density(8);
893 assert!(!reports.is_empty());
894 assert_eq!(reports[0].backend, BackendKind::Cpu);
895 }
896
897 #[test]
898 fn test_bench_lbm_step() {
899 let mut h = GpuBenchHarness {
900 warmup: 0,
901 iterations: 1,
902 reports: Vec::new(),
903 };
904 let reports = h.bench_lbm_step(4, 4, 4);
905 assert_eq!(reports.len(), 1);
906 assert_eq!(reports[0].n, 64);
907 }
908
909 #[test]
910 fn test_bench_parallel_scan() {
911 let mut h = GpuBenchHarness {
912 warmup: 0,
913 iterations: 1,
914 reports: Vec::new(),
915 };
916 let r = h.bench_parallel_scan(100);
917 assert_eq!(r.n, 100);
918 assert!(r.mflops.is_some());
919 }
920
921 #[test]
922 fn test_run_full_suite() {
923 let mut h = GpuBenchHarness {
924 warmup: 0,
925 iterations: 1,
926 reports: Vec::new(),
927 };
928 let summary = h.run_full_suite();
929 assert!(summary.contains("benchmarks"));
930 }
931
932 #[test]
933 fn test_backend_display() {
934 assert_eq!(format!("{}", BackendKind::Cpu), "CPU");
935 assert_eq!(format!("{}", BackendKind::Wgpu), "wgpu");
936 assert_eq!(format!("{}", BackendKind::Cuda), "CUDA");
937 }
938}