oxiphysics_gpu/kernels/
mod.rs

1// Copyright 2026 COOLJAPAN OU (Team KitaSan)
2// SPDX-License-Identifier: Apache-2.0
3
4//! GPU/CPU compute kernels for physics simulation.
5//!
6//! This module groups all low-level compute kernels.  Each sub-module exposes
7//! a CPU-mock implementation that mirrors a GPU kernel in its data layout and
8//! dispatch model, but executes on the CPU using Rayon for parallelism.
9
10pub mod broadphase;
11pub mod md_force;
12pub mod rigid;
13pub mod sph;
14
15// ── Kernel registry helpers ──────────────────────────────────────────────────
16
/// Identifier for a built-in kernel family.
///
/// The [`Display`](std::fmt::Display) implementation renders every variant as
/// a lowercase `snake_case` name (for example `MdForce` becomes `"md_force"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum KernelFamily {
    /// Smoothed Particle Hydrodynamics kernels.
    Sph,
    /// Rigid-body integration and collision kernels.
    Rigid,
    /// Broad-phase AABB/BVH traversal kernels.
    Broadphase,
    /// Molecular dynamics force kernels.
    MdForce,
    /// Signed distance field evaluation kernels.
    SdfCompute,
    /// Neural-network inference kernels.
    NeuralCompute,
    /// Grid-reduce / scan kernels.
    GridReduce,
}

impl std::fmt::Display for KernelFamily {
    /// Writes the family's `snake_case` name.
    ///
    /// The name is emitted through [`std::fmt::Formatter::pad`], so width,
    /// fill and alignment flags supplied by the caller (e.g. `{:<16}`) are
    /// applied. Plain `{}` and `to_string()` output is just the bare name.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Exhaustive on purpose: adding a variant must fail to compile here
        // until it is given a name.
        let name = match self {
            KernelFamily::Sph => "sph",
            KernelFamily::Rigid => "rigid",
            KernelFamily::Broadphase => "broadphase",
            KernelFamily::MdForce => "md_force",
            KernelFamily::SdfCompute => "sdf_compute",
            KernelFamily::NeuralCompute => "neural_compute",
            KernelFamily::GridReduce => "grid_reduce",
        };
        // `write!(f, "{name}")` would start a fresh format spec and drop the
        // caller's padding request; `pad` keeps it.
        f.pad(name)
    }
}
50
51// ── Dispatch descriptor ──────────────────────────────────────────────────────
52
/// Describes the 3-D work-group dispatch dimensions for a kernel launch.
///
/// Mirrors the `(group_count_x, group_count_y, group_count_z)` triple passed
/// to `vkCmdDispatch` / `wgpuComputePassEncoderDispatchWorkgroups`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DispatchDims {
    /// Number of work-groups in X.
    pub x: u32,
    /// Number of work-groups in Y.
    pub y: u32,
    /// Number of work-groups in Z.
    pub z: u32,
}

impl DispatchDims {
    /// Create a 1-D dispatch of `n` work-groups (`y` and `z` are set to 1).
    pub fn linear(n: u32) -> Self {
        Self { x: n, y: 1, z: 1 }
    }

    /// Create a 2-D dispatch of `x` × `y` work-groups (`z` is set to 1).
    pub fn grid2d(x: u32, y: u32) -> Self {
        Self { x, y, z: 1 }
    }

    /// Create a 3-D dispatch of `x` × `y` × `z` work-groups.
    pub fn grid3d(x: u32, y: u32, z: u32) -> Self {
        Self { x, y, z }
    }

    /// Total number of work-groups, i.e. `x * y * z`.
    ///
    /// The product of three `u32` values can exceed `u64::MAX`; in that case
    /// the result saturates at `u64::MAX` instead of panicking (debug builds)
    /// or wrapping (release builds).
    pub fn total_groups(&self) -> u64 {
        u64::from(self.x)
            .saturating_mul(u64::from(self.y))
            .saturating_mul(u64::from(self.z))
    }

    /// Total threads given `threads_per_group`, i.e.
    /// `total_groups() * threads_per_group`.
    ///
    /// Saturates at `u64::MAX` on overflow, like [`Self::total_groups`].
    pub fn total_threads(&self, threads_per_group: u32) -> u64 {
        self.total_groups()
            .saturating_mul(u64::from(threads_per_group))
    }
}
93
/// Number of work-groups required so that `group_size` threads per group
/// cover at least `n` items, i.e. `n / group_size` rounded up.
///
/// A `group_size` of zero yields `0` rather than dividing by zero.
pub fn dispatch_size_1d(n: u32, group_size: u32) -> u32 {
    match group_size {
        0 => 0,
        threads => n.div_ceil(threads),
    }
}
102
103// ── Kernel performance counters ──────────────────────────────────────────────
104
/// Lightweight performance counters accumulated over kernel dispatches.
///
/// Every field is a running total fed by [`KernelPerfCounters::record_dispatch`];
/// the `Default` value has all counters at zero.
#[derive(Debug, Clone, Default)]
pub struct KernelPerfCounters {
    /// Number of times the kernel was dispatched.
    pub dispatch_count: u64,
    /// Total elements processed across all dispatches.
    pub elements_processed: u64,
    /// Estimated floating-point operations (MACs counted as 2 FLOPs).
    pub flop_count: u64,
    /// Total bytes read from global memory (mock).
    pub bytes_read: u64,
    /// Total bytes written to global memory (mock).
    pub bytes_written: u64,
}

impl KernelPerfCounters {
    /// Record one dispatch.
    ///
    /// Increments `dispatch_count` by one and adds `elements`, `flops`,
    /// `bytes_r` and `bytes_w` to `elements_processed`, `flop_count`,
    /// `bytes_read` and `bytes_written` respectively.
    ///
    /// Each counter saturates at `u64::MAX` instead of overflowing, so a
    /// long-running accumulation can neither panic nor wrap back to a small
    /// value.
    pub fn record_dispatch(&mut self, elements: u64, flops: u64, bytes_r: u64, bytes_w: u64) {
        self.dispatch_count = self.dispatch_count.saturating_add(1);
        self.elements_processed = self.elements_processed.saturating_add(elements);
        self.flop_count = self.flop_count.saturating_add(flops);
        self.bytes_read = self.bytes_read.saturating_add(bytes_r);
        self.bytes_written = self.bytes_written.saturating_add(bytes_w);
    }

    /// Arithmetic intensity: `flop_count / (bytes_read + bytes_written)`.
    ///
    /// Returns `0.0` when no bytes have been recorded, which avoids a
    /// division by zero.
    pub fn arithmetic_intensity(&self) -> f64 {
        // Sum in u128: two u64 totals can exceed u64::MAX when added.
        let bytes = u128::from(self.bytes_read) + u128::from(self.bytes_written);
        if bytes == 0 {
            return 0.0;
        }
        self.flop_count as f64 / bytes as f64
    }

    /// Reset all counters to zero.
    pub fn reset(&mut self) {
        *self = Self::default();
    }
}
144
145// ── Shared-memory size helper ────────────────────────────────────────────────
146
/// Shared-memory footprint, in bytes, of a tiled matrix-multiply kernel.
///
/// The result is `2 * tile * tile * size_of::<T>()`: two square tiles of
/// `tile` × `tile` elements of type `T` (presumably one tile per input
/// operand — confirm against the kernels that use this helper).
pub fn smem_bytes_matmul<T>(tile: usize) -> usize {
    let element_bytes = std::mem::size_of::<T>();
    let elements_in_both_tiles = 2 * tile * tile;
    elements_in_both_tiles * element_bytes
}
152
153// ── Barrier simulation ───────────────────────────────────────────────────────
154
/// Marks a work-group synchronisation point.
///
/// In this CPU build the function does not wait on other threads; the only
/// thing it executes is a sequentially-consistent
/// [`fence`](std::sync::atomic::fence). Its main purpose is to document where
/// a future GPU backend would need a real barrier.
#[inline(always)]
pub fn workgroup_barrier() {
    use std::sync::atomic::{fence, Ordering};

    // Author's note: Rayon's fork-join is expected to provide the actual
    // synchronisation on the CPU path, so nothing beyond the fence is done.
    fence(Ordering::SeqCst);
}
162
163// ── Predefined group sizes ───────────────────────────────────────────────────
164
/// Predefined power-of-two work-group sizes (threads per work-group).
///
/// The hardware remarks on the individual constants are rules of thumb
/// carried over from the original comments, not guarantees — check the limits
/// of the actual target device.
pub mod group_sizes {
    /// 64 threads (2^6) — noted as common on AMD RDNA and for register-heavy kernels.
    pub const WG_64: u32 = 1 << 6;
    /// 128 threads (2^7) — noted as a common general-purpose choice.
    pub const WG_128: u32 = 1 << 7;
    /// 256 threads (2^8) — noted as the default for many CUDA/Vulkan kernels.
    pub const WG_256: u32 = 1 << 8;
    /// 512 threads (2^9) — noted as useful for reduction passes.
    pub const WG_512: u32 = 1 << 9;
    /// 1024 threads (2^10) — noted as the maximum work-group size on most hardware.
    pub const WG_1024: u32 = 1 << 10;
}
178
179// ── Tests ────────────────────────────────────────────────────────────────────
180
#[cfg(test)]
mod kernel_mod_tests {
    //! Unit tests for the helpers defined directly in this module.

    use super::*;

    /// Selected families render to their snake_case names.
    #[test]
    fn test_kernel_family_display() {
        let expected = [
            (KernelFamily::Sph, "sph"),
            (KernelFamily::NeuralCompute, "neural_compute"),
            (KernelFamily::GridReduce, "grid_reduce"),
        ];
        for (family, name) in expected {
            assert_eq!(family.to_string(), name);
        }
    }

    /// A linear dispatch has `n` groups and `n * threads_per_group` threads.
    #[test]
    fn test_dispatch_dims_linear() {
        let dims = DispatchDims::linear(128);
        let groups = dims.total_groups();
        let threads = dims.total_threads(256);
        assert_eq!(groups, 128);
        assert_eq!(threads, 128 * 256);
    }

    /// A 4×4×4 grid contains 64 work-groups.
    #[test]
    fn test_dispatch_dims_grid3d() {
        assert_eq!(DispatchDims::grid3d(4, 4, 4).total_groups(), 64);
    }

    /// An item count that is an exact multiple needs no extra group.
    #[test]
    fn test_dispatch_size_1d_exact() {
        let groups = dispatch_size_1d(256, 64);
        assert_eq!(groups, 4);
    }

    /// One leftover item forces an additional group.
    #[test]
    fn test_dispatch_size_1d_remainder() {
        let groups = dispatch_size_1d(257, 64);
        assert_eq!(groups, 5);
    }

    /// A zero group size is reported as zero groups.
    #[test]
    fn test_dispatch_size_1d_zero_group() {
        let groups = dispatch_size_1d(100, 0);
        assert_eq!(groups, 0);
    }

    /// 8192 FLOPs over 4096 + 4096 bytes gives an intensity of 1.0.
    #[test]
    fn test_perf_counters_arithmetic_intensity() {
        let mut counters = KernelPerfCounters::default();
        counters.record_dispatch(1024, 8192, 4096, 4096);
        let deviation = (counters.arithmetic_intensity() - 1.0).abs();
        assert!(deviation < 1e-10);
    }

    /// `reset` brings the checked counters back to zero.
    #[test]
    fn test_perf_counters_reset() {
        let mut counters = KernelPerfCounters::default();
        counters.record_dispatch(512, 1024, 512, 512);
        counters.reset();
        assert_eq!(counters.dispatch_count, 0);
        assert_eq!(counters.flop_count, 0);
    }

    /// 2 tiles * 16 * 16 elements * 4 bytes = 2048 bytes.
    #[test]
    fn test_smem_bytes_matmul_f32() {
        assert_eq!(smem_bytes_matmul::<f32>(16), 2048);
    }

    /// 2 tiles * 16 * 16 elements * 8 bytes = 4096 bytes.
    #[test]
    fn test_smem_bytes_matmul_f64() {
        assert_eq!(smem_bytes_matmul::<f64>(16), 4096);
    }

    /// Calling the barrier must not panic.
    #[test]
    fn test_workgroup_barrier_no_panic() {
        workgroup_barrier();
    }

    /// The predefined group sizes are strictly increasing (checked at compile time).
    #[test]
    fn test_group_sizes_constants() {
        use group_sizes::{WG_1024, WG_128, WG_256, WG_512, WG_64};
        const _: () = assert!(
            WG_64 < WG_128 && WG_128 < WG_256 && WG_256 < WG_512 && WG_512 < WG_1024
        );
    }
}
oxiphysics_gpu/kernels/mod.rs

oxiphysics_gpu/kernels/
mod.rs