Skip to main content

oxiphysics_gpu/fluid_sim_gpu/
functions.rs

1//! Auto-generated module
2//!
3//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)
4
5#![allow(clippy::ptr_arg)]
6use super::types::{
7    FlipParticle, GpuBoundaryBox, LbmCellType, LbmD2Q9, MacGrid, SphConfig, SphKernels, SphParticle,
8};
9
10pub(super) fn dot3(a: [f64; 3], b: [f64; 3]) -> f64 {
11    a[0] * b[0] + a[1] * b[1] + a[2] * b[2]
12}
13pub(super) fn sub3(a: [f64; 3], b: [f64; 3]) -> [f64; 3] {
14    [a[0] - b[0], a[1] - b[1], a[2] - b[2]]
15}
16pub(super) fn add3(a: [f64; 3], b: [f64; 3]) -> [f64; 3] {
17    [a[0] + b[0], a[1] + b[1], a[2] + b[2]]
18}
19pub(super) fn scale3(v: [f64; 3], s: f64) -> [f64; 3] {
20    [v[0] * s, v[1] * s, v[2] * s]
21}
22pub(super) fn length3(v: [f64; 3]) -> f64 {
23    dot3(v, v).sqrt()
24}
25pub(super) fn normalize3(v: [f64; 3]) -> [f64; 3] {
26    let len = length3(v);
27    if len < 1e-15 {
28        return [0.0; 3];
29    }
30    scale3(v, 1.0 / len)
31}
32pub(super) fn cross3(a: [f64; 3], b: [f64; 3]) -> [f64; 3] {
33    [
34        a[1] * b[2] - a[2] * b[1],
35        a[2] * b[0] - a[0] * b[2],
36        a[0] * b[1] - a[1] * b[0],
37    ]
38}
39/// Compute SPH density for all particles (mock GPU kernel dispatch).
40///
41/// For each particle i, density_i = Σ_j m_j * W_poly6(|r_i - r_j|, h)
42pub fn sph_compute_density(particles: &mut Vec<SphParticle>, config: &SphConfig) {
43    let n = particles.len();
44    let mut densities = vec![0.0f64; n];
45    for i in 0..n {
46        let mut rho = 0.0;
47        for j in 0..n {
48            let r_vec = sub3(particles[i].position, particles[j].position);
49            let r = length3(r_vec);
50            rho += particles[j].mass * SphKernels::poly6(r, config.h);
51        }
52        densities[i] = rho.max(1e-6);
53    }
54    for (i, p) in particles.iter_mut().enumerate() {
55        p.density = densities[i];
56    }
57}
58/// Compute SPH pressure from density (Tait equation).
59pub fn sph_compute_pressure(particles: &mut Vec<SphParticle>, config: &SphConfig) {
60    for p in particles.iter_mut() {
61        let ratio = p.density / config.rest_density;
62        p.pressure = config.pressure_k * (ratio.powi(7) - 1.0);
63    }
64}
65/// Compute SPH forces: pressure gradient + viscosity + gravity + surface tension.
66pub fn sph_compute_forces(particles: &mut Vec<SphParticle>, config: &SphConfig) {
67    let n = particles.len();
68    let mut forces = vec![[0.0f64; 3]; n];
69    for i in 0..n {
70        let mut f_pressure = [0.0f64; 3];
71        let mut f_viscosity = [0.0f64; 3];
72        for j in 0..n {
73            if i == j {
74                continue;
75            }
76            let r_vec = sub3(particles[i].position, particles[j].position);
77            let r = length3(r_vec);
78            if r > config.h || r < 1e-15 {
79                continue;
80            }
81            let pressure_factor = particles[i].pressure
82                / (particles[i].density * particles[i].density)
83                + particles[j].pressure / (particles[j].density * particles[j].density);
84            let grad = SphKernels::spiky_grad(r_vec, r, config.h);
85            f_pressure = add3(
86                f_pressure,
87                scale3(grad, -particles[j].mass * pressure_factor),
88            );
89            let v_diff = sub3(particles[j].velocity, particles[i].velocity);
90            let lap = SphKernels::viscosity_laplacian(r, config.h);
91            let visc_factor = config.viscosity * particles[j].mass * lap / particles[j].density;
92            f_viscosity = add3(f_viscosity, scale3(v_diff, visc_factor));
93        }
94        let f_gravity = scale3(config.gravity, particles[i].density);
95        forces[i] = add3(add3(f_pressure, f_viscosity), f_gravity);
96    }
97    for (i, p) in particles.iter_mut().enumerate() {
98        p.force = forces[i];
99    }
100}
101/// Integrate SPH particles using semi-implicit Euler.
102pub fn sph_integrate(particles: &mut Vec<SphParticle>, config: &SphConfig) {
103    for p in particles.iter_mut() {
104        let accel = scale3(p.force, 1.0 / p.density.max(1e-6));
105        p.velocity = add3(p.velocity, scale3(accel, config.dt));
106        p.position = add3(p.position, scale3(p.velocity, config.dt));
107    }
108}
109/// Full SPH step: density → pressure → forces → integrate.
110pub fn sph_step(particles: &mut Vec<SphParticle>, config: &SphConfig) {
111    sph_compute_density(particles, config);
112    sph_compute_pressure(particles, config);
113    sph_compute_forces(particles, config);
114    sph_integrate(particles, config);
115}
116/// LBM D3Q27 velocity set index count.
117pub const D3Q27_Q: usize = 27;
118/// LBM D2Q9 velocity set index count.
119pub const D2Q9_Q: usize = 9;
120/// D2Q9 x-component velocity vectors.
121pub const D2Q9_CX: [i32; 9] = [0, 1, 0, -1, 0, 1, -1, -1, 1];
122/// D2Q9 y-component velocity vectors.
123pub const D2Q9_CY: [i32; 9] = [0, 0, 1, 0, -1, 1, 1, -1, -1];
124/// D2Q9 equilibrium weights.
125pub const D2Q9_W: [f64; 9] = [
126    4.0 / 9.0,
127    1.0 / 9.0,
128    1.0 / 9.0,
129    1.0 / 9.0,
130    1.0 / 9.0,
131    1.0 / 36.0,
132    1.0 / 36.0,
133    1.0 / 36.0,
134    1.0 / 36.0,
135];
136/// D2Q9 opposite direction indices (for bounce-back).
137pub const D2Q9_OPP: [usize; 9] = [0, 3, 4, 1, 2, 7, 8, 5, 6];
138/// Semi-Lagrangian advection of a scalar field `phi` on a MAC grid.
139///
140/// Uses RK2 (midpoint method) for back-tracing.
141pub fn advect_scalar(phi: &[f64], grid: &MacGrid, dt: f64, gravity: [f64; 3]) -> Vec<f64> {
142    let nx = grid.nx;
143    let ny = grid.ny;
144    let nz = grid.nz;
145    let dx = grid.dx;
146    let mut phi_new = vec![0.0f64; nx * ny * nz];
147    for k in 0..nz {
148        for j in 0..ny {
149            for i in 0..nx {
150                let xc = (i as f64 + 0.5) * dx;
151                let yc = (j as f64 + 0.5) * dx;
152                let zc = (k as f64 + 0.5) * dx;
153                let u0 = grid.interp_u(xc, yc, zc);
154                let x_mid = xc - 0.5 * dt * (u0 + gravity[0]);
155                let y_mid = yc - 0.5 * dt * gravity[1];
156                let z_mid = zc - 0.5 * dt * gravity[2];
157                let u_mid = grid.interp_u(x_mid, y_mid, z_mid);
158                let x_back = xc - dt * (u_mid + gravity[0]);
159                let y_back = yc - dt * gravity[1];
160                let z_back = zc - dt * gravity[2];
161                let ix = (x_back / dx - 0.5).floor() as isize;
162                let iy = (y_back / dx - 0.5).floor() as isize;
163                let iz = (z_back / dx - 0.5).floor() as isize;
164                let ii = ix.clamp(0, nx as isize - 1) as usize;
165                let jj = iy.clamp(0, ny as isize - 1) as usize;
166                let kk = iz.clamp(0, nz as isize - 1) as usize;
167                phi_new[k * nx * ny + j * nx + i] = phi[kk * nx * ny + jj * nx + ii];
168            }
169        }
170    }
171    phi_new
172}
173/// Compute curl (vorticity) of the velocity field at cell centers.
174///
175/// Returns a flat buffer of 3-vectors (ωx, ωy, ωz).
176pub fn compute_vorticity(grid: &MacGrid) -> Vec<[f64; 3]> {
177    let nx = grid.nx;
178    let ny = grid.ny;
179    let nz = grid.nz;
180    let inv2dx = 1.0 / (2.0 * grid.dx);
181    let mut vorticity = vec![[0.0f64; 3]; nx * ny * nz];
182    for k in 1..nz - 1 {
183        for j in 1..ny - 1 {
184            for i in 1..nx - 1 {
185                let dwdy = (grid.get_w(i, j + 1, k) - grid.get_w(i, j - 1, k)) * inv2dx;
186                let dvdz = (grid.get_v(i, j, k + 1) - grid.get_v(i, j, k - 1)) * inv2dx;
187                let dudz = (grid.get_u(i, j, k + 1) - grid.get_u(i, j, k - 1)) * inv2dx;
188                let dwdx = (grid.get_w(i + 1, j, k) - grid.get_w(i - 1, j, k)) * inv2dx;
189                let dvdx = (grid.get_v(i + 1, j, k) - grid.get_v(i - 1, j, k)) * inv2dx;
190                let dudy = (grid.get_u(i, j + 1, k) - grid.get_u(i, j - 1, k)) * inv2dx;
191                let idx = k * nx * ny + j * nx + i;
192                vorticity[idx] = [dwdy - dvdz, dudz - dwdx, dvdx - dudy];
193            }
194        }
195    }
196    vorticity
197}
198/// Apply vorticity confinement forces to MAC grid velocities.
199///
200/// Vorticity confinement adds a force F = ε * (N × ω)
201/// where N is the normalized gradient of |ω|.
202pub fn vorticity_confinement(grid: &mut MacGrid, vorticity: &[[f64; 3]], epsilon: f64, dt: f64) {
203    let nx = grid.nx;
204    let ny = grid.ny;
205    let nz = grid.nz;
206    let inv2dx = 1.0 / (2.0 * grid.dx);
207    for k in 1..nz - 1 {
208        for j in 1..ny - 1 {
209            for i in 1..nx - 1 {
210                let mag = |x: usize, y: usize, z: usize| {
211                    let idx = z * nx * ny + y * nx + x;
212                    length3(vorticity[idx])
213                };
214                let gx = (mag(i + 1, j, k) - mag(i - 1, j, k)) * inv2dx;
215                let gy = (mag(i, j + 1, k) - mag(i, j - 1, k)) * inv2dx;
216                let gz = (mag(i, j, k + 1) - mag(i, j, k - 1)) * inv2dx;
217                let grad_len = (gx * gx + gy * gy + gz * gz).sqrt();
218                if grad_len < 1e-12 {
219                    continue;
220                }
221                let n_vec = [gx / grad_len, gy / grad_len, gz / grad_len];
222                let idx = k * nx * ny + j * nx + i;
223                let omega = vorticity[idx];
224                let force = cross3(n_vec, omega);
225                let force = scale3(force, epsilon);
226                if i < nx {
227                    let u = grid.get_u(i, j, k);
228                    grid.set_u(i, j, k, u + dt * force[0]);
229                }
230                if j < ny {
231                    let v = grid.get_v(i, j, k);
232                    grid.set_v(i, j, k, v + dt * force[1]);
233                }
234                if k < nz {
235                    let w = grid.get_w(i, j, k);
236                    grid.set_w(i, j, k, w + dt * force[2]);
237                }
238            }
239        }
240    }
241}
242/// Surface tension force via Continuum Surface Force (CSF) method.
243///
244/// Requires a level-set function `phi` (negative inside fluid, positive outside).
245/// Returns per-cell force vectors.
246pub fn surface_tension_csf(
247    phi: &[f64],
248    nx: usize,
249    ny: usize,
250    nz: usize,
251    dx: f64,
252    sigma: f64,
253) -> Vec<[f64; 3]> {
254    let inv_dx = 1.0 / dx;
255    let inv2dx = 0.5 * inv_dx;
256    let n = nx * ny * nz;
257    let mut forces = vec![[0.0f64; 3]; n];
258    let idx = |i: usize, j: usize, k: usize| k * nx * ny + j * nx + i;
259    for k in 1..nz - 1 {
260        for j in 1..ny - 1 {
261            for i in 1..nx - 1 {
262                let c = idx(i, j, k);
263                let gx = (phi[idx(i + 1, j, k)] - phi[idx(i - 1, j, k)]) * inv2dx;
264                let gy = (phi[idx(i, j + 1, k)] - phi[idx(i, j - 1, k)]) * inv2dx;
265                let gz = (phi[idx(i, j, k + 1)] - phi[idx(i, j, k - 1)]) * inv2dx;
266                let grad_mag = (gx * gx + gy * gy + gz * gz).sqrt();
267                if grad_mag < 1e-12 {
268                    continue;
269                }
270                let nx_n = gx / grad_mag;
271                let ny_n = gy / grad_mag;
272                let nz_n = gz / grad_mag;
273                let phi_xx = (phi[idx(i + 1, j, k)] - 2.0 * phi[c] + phi[idx(i - 1, j, k)])
274                    * inv_dx
275                    * inv_dx;
276                let phi_yy = (phi[idx(i, j + 1, k)] - 2.0 * phi[c] + phi[idx(i, j - 1, k)])
277                    * inv_dx
278                    * inv_dx;
279                let phi_zz = (phi[idx(i, j, k + 1)] - 2.0 * phi[c] + phi[idx(i, j, k - 1)])
280                    * inv_dx
281                    * inv_dx;
282                let kappa = -(phi_xx + phi_yy + phi_zz) / grad_mag;
283                let f_scale = sigma * kappa;
284                forces[c] = [f_scale * nx_n, f_scale * ny_n, f_scale * nz_n];
285            }
286        }
287    }
288    forces
289}
290/// Particle-to-Grid (P2G) transfer: splat particle velocities onto MAC grid.
291///
292/// Uses trilinear weighting.
293pub fn p2g_transfer(particles: &[FlipParticle], grid: &mut MacGrid) {
294    let dx = grid.dx;
295    let inv_dx = 1.0 / dx;
296    let nx = grid.nx;
297    let ny = grid.ny;
298    let nz = grid.nz;
299    let mut u_weight = vec![0.0f64; (nx + 1) * ny * nz];
300    let mut v_weight = vec![0.0f64; nx * (ny + 1) * nz];
301    let mut w_weight = vec![0.0f64; nx * ny * (nz + 1)];
302    let mut u_num = vec![0.0f64; (nx + 1) * ny * nz];
303    let mut v_num = vec![0.0f64; nx * (ny + 1) * nz];
304    let mut w_num = vec![0.0f64; nx * ny * (nz + 1)];
305    for p in particles {
306        let [px, py, pz] = p.position;
307        let iu = (px * inv_dx).floor() as isize;
308        let ju = (py * inv_dx - 0.5).floor() as isize;
309        let ku = (pz * inv_dx - 0.5).floor() as isize;
310        let fxu = px * inv_dx - iu as f64;
311        let fyu = py * inv_dx - 0.5 - ju as f64;
312        let fzu = pz * inv_dx - 0.5 - ku as f64;
313        for dz in 0..2 {
314            for dy in 0..2 {
315                for dx_off in 0..2 {
316                    let ni = (iu + dx_off as isize).clamp(0, nx as isize) as usize;
317                    let nj = (ju + dy as isize).clamp(0, ny as isize - 1) as usize;
318                    let nk = (ku + dz as isize).clamp(0, nz as isize - 1) as usize;
319                    let wx = if dx_off == 0 { 1.0 - fxu } else { fxu };
320                    let wy = if dy == 0 { 1.0 - fyu } else { fyu };
321                    let wz = if dz == 0 { 1.0 - fzu } else { fzu };
322                    let w = wx * wy * wz;
323                    let uidx = nk * (nx + 1) * ny + nj * (nx + 1) + ni;
324                    u_num[uidx] += w * p.velocity[0];
325                    u_weight[uidx] += w;
326                }
327            }
328        }
329        let iv = (px * inv_dx - 0.5).floor() as isize;
330        let jv = (py * inv_dx).floor() as isize;
331        let kv = (pz * inv_dx - 0.5).floor() as isize;
332        let fxv = px * inv_dx - 0.5 - iv as f64;
333        let fyv = py * inv_dx - jv as f64;
334        let fzv = pz * inv_dx - 0.5 - kv as f64;
335        for dz in 0..2 {
336            for dy in 0..2 {
337                for dx_off in 0..2 {
338                    let ni = (iv + dx_off as isize).clamp(0, nx as isize - 1) as usize;
339                    let nj = (jv + dy as isize).clamp(0, ny as isize) as usize;
340                    let nk = (kv + dz as isize).clamp(0, nz as isize - 1) as usize;
341                    let wx = if dx_off == 0 { 1.0 - fxv } else { fxv };
342                    let wy = if dy == 0 { 1.0 - fyv } else { fyv };
343                    let wz = if dz == 0 { 1.0 - fzv } else { fzv };
344                    let wt = wx * wy * wz;
345                    let vidx = nk * nx * (ny + 1) + nj * nx + ni;
346                    v_num[vidx] += wt * p.velocity[1];
347                    v_weight[vidx] += wt;
348                }
349            }
350        }
351        let iw = (px * inv_dx - 0.5).floor() as isize;
352        let jw = (py * inv_dx - 0.5).floor() as isize;
353        let kw = (pz * inv_dx).floor() as isize;
354        let fxw = px * inv_dx - 0.5 - iw as f64;
355        let fyw = py * inv_dx - 0.5 - jw as f64;
356        let fzw = pz * inv_dx - kw as f64;
357        for dz in 0..2 {
358            for dy in 0..2 {
359                for dx_off in 0..2 {
360                    let ni = (iw + dx_off as isize).clamp(0, nx as isize - 1) as usize;
361                    let nj = (jw + dy as isize).clamp(0, ny as isize - 1) as usize;
362                    let nk = (kw + dz as isize).clamp(0, nz as isize) as usize;
363                    let wx = if dx_off == 0 { 1.0 - fxw } else { fxw };
364                    let wy = if dy == 0 { 1.0 - fyw } else { fyw };
365                    let wz = if dz == 0 { 1.0 - fzw } else { fzw };
366                    let wt = wx * wy * wz;
367                    let widx = nk * nx * ny + nj * nx + ni;
368                    w_num[widx] += wt * p.velocity[2];
369                    w_weight[widx] += wt;
370                }
371            }
372        }
373    }
374    for i in 0..u_num.len() {
375        grid.u[i] = if u_weight[i] > 1e-15 {
376            u_num[i] / u_weight[i]
377        } else {
378            0.0
379        };
380    }
381    for i in 0..v_num.len() {
382        grid.v[i] = if v_weight[i] > 1e-15 {
383            v_num[i] / v_weight[i]
384        } else {
385            0.0
386        };
387    }
388    for i in 0..w_num.len() {
389        grid.w[i] = if w_weight[i] > 1e-15 {
390            w_num[i] / w_weight[i]
391        } else {
392            0.0
393        };
394    }
395}
396/// Grid-to-Particle (G2P) transfer: update particle velocities from MAC grid.
397///
398/// `flip_ratio` in \[0,1\]: 1.0 = pure FLIP, 0.0 = pure PIC.
399pub fn g2p_transfer(
400    particles: &mut [FlipParticle],
401    grid_new: &MacGrid,
402    grid_old: &MacGrid,
403    flip_ratio: f64,
404) {
405    for p in particles.iter_mut() {
406        let [px, py, pz] = p.position;
407        let dx = grid_new.dx;
408        let u_new = grid_new.interp_u(px, py, pz);
409        let u_old = grid_old.interp_u(px, py, pz);
410        let pic_vel = [u_new, 0.0, 0.0];
411        let flip_delta = [u_new - u_old, 0.0, 0.0];
412        let flip_vel = add3(p.velocity, flip_delta);
413        p.velocity = [
414            flip_ratio * flip_vel[0] + (1.0 - flip_ratio) * pic_vel[0],
415            p.velocity[1],
416            p.velocity[2],
417        ];
418        p.position = add3(p.position, scale3(p.velocity, dx));
419    }
420}
421/// GPU-accelerated SPH density summation.
422///
423/// Simulates a GPU kernel where each "thread" computes the density contribution
424/// from all neighbours within the smoothing radius h. In a real GPU
425/// implementation this would be launched as one thread per particle pair with
426/// atomic accumulation. Here we use a parallel-style double loop and write
427/// results into a temporary buffer first to keep the interface identical to a
428/// real GPU dispatch.
429pub fn gpu_sph_density_parallel(particles: &mut Vec<SphParticle>, config: &SphConfig) {
430    let n = particles.len();
431    let mut densities = vec![0.0f64; n];
432    for i in 0..n {
433        let mut rho = 0.0;
434        for j in 0..n {
435            let r_vec = sub3(particles[i].position, particles[j].position);
436            let r = length3(r_vec);
437            rho += particles[j].mass * SphKernels::poly6(r, config.h);
438        }
439        densities[i] = rho.max(1e-6);
440    }
441    for (i, p) in particles.iter_mut().enumerate() {
442        p.density = densities[i];
443    }
444}
445/// GPU Jacobi pressure solver on a MAC grid.
446///
447/// Identical algorithm to `MacGrid::jacobi_pressure_solve` but expressed as a
448/// function that could be launched as a compute shader (one thread per cell).
449/// The indirection through a separate function makes the GPU dispatch boundary
450/// explicit.
451pub fn gpu_jacobi_pressure_solve(grid: &mut MacGrid, rho: f64, dt: f64, iterations: usize) {
452    grid.compute_divergence();
453    let scale = rho * grid.dx * grid.dx / dt;
454    let nx = grid.nx;
455    let ny = grid.ny;
456    let nz = grid.nz;
457    let mut p_new = grid.p.clone();
458    for _ in 0..iterations {
459        for k in 0..nz {
460            for j in 0..ny {
461                for i in 0..nx {
462                    let idx = k * nx * ny + j * nx + i;
463                    if grid.flags[idx] != 1 {
464                        continue;
465                    }
466                    let mut nb_sum = 0.0;
467                    let mut nb_cnt = 0u32;
468                    macro_rules! nb {
469                        ($ii:expr, $jj:expr, $kk:expr) => {{
470                            nb_sum += grid.p[$kk * nx * ny + $jj * nx + $ii];
471                            nb_cnt += 1;
472                        }};
473                    }
474                    if i + 1 < nx {
475                        nb!(i + 1, j, k);
476                    }
477                    if i > 0 {
478                        nb!(i - 1, j, k);
479                    }
480                    if j + 1 < ny {
481                        nb!(i, j + 1, k);
482                    }
483                    if j > 0 {
484                        nb!(i, j - 1, k);
485                    }
486                    if k + 1 < nz {
487                        nb!(i, j, k + 1);
488                    }
489                    if k > 0 {
490                        nb!(i, j, k - 1);
491                    }
492                    if nb_cnt > 0 {
493                        p_new[idx] = (nb_sum - scale * grid.div[idx]) / nb_cnt as f64;
494                    }
495                }
496            }
497        }
498        grid.p.copy_from_slice(&p_new);
499    }
500}
501/// GPU-style BGK collision step for the D2Q9 LBM grid.
502///
503/// In a real GPU implementation each cell maps to one thread.  The function
504/// signature mirrors a compute-shader kernel: it takes the whole grid and
505/// applies BGK in-place.
506pub fn gpu_lbm_bgk_collide(lbm: &mut LbmD2Q9) {
507    let nx = lbm.nx;
508    let ny = lbm.ny;
509    let inv_tau = lbm.inv_tau;
510    let mut updates: Vec<(usize, usize, usize, f64)> = Vec::with_capacity(nx * ny * D2Q9_Q);
511    for y in 0..ny {
512        for x in 0..nx {
513            if lbm.cell_type[y * nx + x] == LbmCellType::Solid {
514                continue;
515            }
516            let rho = lbm.density(x, y);
517            let [ux, uy] = lbm.velocity(x, y);
518            for q in 0..D2Q9_Q {
519                let f_eq = LbmD2Q9::f_equilibrium(rho, ux, uy, q);
520                let f_old = lbm.get_f(x, y, q);
521                let f_new = f_old - inv_tau * (f_old - f_eq);
522                updates.push((x, y, q, f_new));
523            }
524        }
525    }
526    for (x, y, q, val) in updates {
527        lbm.set_f(x, y, q, val);
528    }
529}
530/// Expand a 10-bit integer into a 30-bit Morton code component (bit interleave).
531pub fn morton_expand_bits(mut v: u32) -> u32 {
532    v &= 0x000003ff;
533    v = (v | (v << 16)) & 0xff0000ff;
534    v = (v | (v << 8)) & 0x0300f00f;
535    v = (v | (v << 4)) & 0x030c30c3;
536    v = (v | (v << 2)) & 0x09249249;
537    v
538}
539/// Encode 3D integer coordinates into a 30-bit Morton (Z-order curve) code.
540pub fn morton_encode_3d(x: u32, y: u32, z: u32) -> u32 {
541    morton_expand_bits(x) | (morton_expand_bits(y) << 1) | (morton_expand_bits(z) << 2)
542}
543/// Sort SPH particles by their Morton code for cache-friendly GPU access.
544///
545/// Positions are normalised into the unit cube `[0, domain_size]` and then
546/// quantised to 10 bits per axis (1024 levels) before Morton encoding.
547pub fn morton_sort_particles(particles: &mut Vec<SphParticle>, domain_size: [f64; 3]) {
548    let bits = 1024u32;
549    let mut keyed: Vec<(u32, SphParticle)> = particles
550        .drain(..)
551        .map(|p| {
552            let xi = ((p.position[0] / domain_size[0]).clamp(0.0, 1.0) * (bits - 1) as f64) as u32;
553            let yi = ((p.position[1] / domain_size[1]).clamp(0.0, 1.0) * (bits - 1) as f64) as u32;
554            let zi = ((p.position[2] / domain_size[2]).clamp(0.0, 1.0) * (bits - 1) as f64) as u32;
555            (morton_encode_3d(xi, yi, zi), p)
556        })
557        .collect();
558    keyed.sort_by_key(|(code, _)| *code);
559    *particles = keyed.into_iter().map(|(_, p)| p).collect();
560}
561/// GPU Euler integration: v += a*dt, x += v*dt.
562///
563/// Maps to one GPU thread per particle.
564pub fn gpu_particle_integrate_euler(particles: &mut Vec<SphParticle>, dt: f64) {
565    for p in particles.iter_mut() {
566        let inv_rho = 1.0 / p.density.max(1e-6);
567        for d in 0..3 {
568            let accel = p.force[d] * inv_rho;
569            p.velocity[d] += accel * dt;
570            p.position[d] += p.velocity[d] * dt;
571        }
572    }
573}
574/// GPU Verlet integration using the previous time-step `dt_prev`.
575///
576/// x_new = x + v*dt + 0.5*a*dt^2 (Störmer–Verlet, velocity-explicit variant).
577pub fn gpu_particle_integrate_verlet(particles: &mut Vec<SphParticle>, dt: f64, _dt_prev: f64) {
578    for p in particles.iter_mut() {
579        let inv_rho = 1.0 / p.density.max(1e-6);
580        for d in 0..3 {
581            let accel = p.force[d] * inv_rho;
582            p.position[d] += p.velocity[d] * dt + 0.5 * accel * dt * dt;
583            p.velocity[d] += accel * dt;
584        }
585    }
586}
587/// GPU boundary condition: clamp particles inside an AABB and reflect velocities.
588///
589/// One GPU thread per particle; no inter-thread communication needed.
590pub fn gpu_apply_boundary_box(particles: &mut Vec<SphParticle>, bounds: &GpuBoundaryBox) {
591    for p in particles.iter_mut() {
592        for d in 0..3 {
593            if p.position[d] < bounds.min[d] {
594                p.position[d] = bounds.min[d];
595                p.velocity[d] = p.velocity[d].abs() * bounds.restitution;
596            }
597            if p.position[d] > bounds.max[d] {
598                p.position[d] = bounds.max[d];
599                p.velocity[d] = -p.velocity[d].abs() * bounds.restitution;
600            }
601        }
602    }
603}
604/// GPU parallel reduction for total kinetic energy.
605///
606/// In GPU hardware this would be a tree-reduction across threads in a work-group;
607/// here we use an iterator fold which has the same semantics.
608///
609/// KE = Σ 0.5 * mass * |v|²
610pub fn gpu_reduce_kinetic_energy(particles: &[SphParticle], mass: f64) -> f64 {
611    particles.iter().fold(0.0, |acc, p| {
612        let v2 = p.velocity[0] * p.velocity[0]
613            + p.velocity[1] * p.velocity[1]
614            + p.velocity[2] * p.velocity[2];
615        acc + 0.5 * mass * v2
616    })
617}
618/// GPU parallel reduction for total linear momentum.
619///
620/// Returns a 3-vector \[px, py, pz\] = Σ mass * v_i.
621pub fn gpu_reduce_momentum(particles: &[SphParticle], mass: f64) -> [f64; 3] {
622    particles.iter().fold([0.0f64; 3], |acc, p| {
623        [
624            acc[0] + mass * p.velocity[0],
625            acc[1] + mass * p.velocity[1],
626            acc[2] + mass * p.velocity[2],
627        ]
628    })
629}
630/// GPU semi-Lagrangian scalar field advection on a 2D regular grid.
631///
632/// Each cell is a separate GPU thread: back-traces the characteristic by dt
633/// through the supplied velocity field `vel` (one 2-vector per cell) and
634/// samples the field using bilinear interpolation.
635pub fn gpu_advect_2d(
636    field: &[f64],
637    vel: &[[f64; 2]],
638    nx: usize,
639    ny: usize,
640    dx: f64,
641    dt: f64,
642) -> Vec<f64> {
643    let mut out = vec![0.0f64; nx * ny];
644    let inv_dx = 1.0 / dx;
645    let sample = |x: f64, y: f64| -> f64 {
646        let ix = (x * inv_dx - 0.5).floor() as isize;
647        let iy = (y * inv_dx - 0.5).floor() as isize;
648        let fx = x * inv_dx - 0.5 - ix as f64;
649        let fy = y * inv_dx - 0.5 - iy as f64;
650        let ci = |v: isize| v.clamp(0, nx as isize - 1) as usize;
651        let cj = |v: isize| v.clamp(0, ny as isize - 1) as usize;
652        let f00 = field[cj(iy) * nx + ci(ix)];
653        let f10 = field[cj(iy) * nx + ci(ix + 1)];
654        let f01 = field[cj(iy + 1) * nx + ci(ix)];
655        let f11 = field[cj(iy + 1) * nx + ci(ix + 1)];
656        let f0 = f00 + fx * (f10 - f00);
657        let f1 = f01 + fx * (f11 - f01);
658        f0 + fy * (f1 - f0)
659    };
660    for j in 0..ny {
661        for i in 0..nx {
662            let xc = (i as f64 + 0.5) * dx;
663            let yc = (j as f64 + 0.5) * dx;
664            let idx = j * nx + i;
665            let [vx, vy] = vel[idx];
666            let xb = xc - vx * dt;
667            let yb = yc - vy * dt;
668            out[idx] = sample(xb, yb);
669        }
670    }
671    out
672}
673/// GPU Jacobi solver for the 2D pressure-Poisson equation on a staggered grid.
674///
675/// Solves ∇²p = rhs (passed as `div`) for `iterations` Jacobi sweeps.
676/// The staggered MAC discretisation gives the standard 5-point stencil:
677///   p\[i,j\] = (p\[i+1,j\] + p\[i-1,j\] + p\[i,j+1\] + p\[i,j-1\] - dx²*rhs\[i,j\]) / 4
678pub fn gpu_pressure_poisson_jacobi_2d(
679    pressure: &mut Vec<f64>,
680    div: &[f64],
681    nx: usize,
682    ny: usize,
683    dx: f64,
684    iterations: usize,
685) {
686    let dx2 = dx * dx;
687    let mut p_new = pressure.clone();
688    for _ in 0..iterations {
689        for j in 0..ny {
690            for i in 0..nx {
691                let idx = j * nx + i;
692                let mut nb = 0.0;
693                let mut cnt = 0u32;
694                if i + 1 < nx {
695                    nb += pressure[j * nx + i + 1];
696                    cnt += 1;
697                }
698                if i > 0 {
699                    nb += pressure[j * nx + i - 1];
700                    cnt += 1;
701                }
702                if j + 1 < ny {
703                    nb += pressure[(j + 1) * nx + i];
704                    cnt += 1;
705                }
706                if j > 0 {
707                    nb += pressure[(j - 1) * nx + i];
708                    cnt += 1;
709                }
710                if cnt > 0 {
711                    p_new[idx] = (nb - dx2 * div[idx]) / cnt as f64;
712                }
713            }
714        }
715        pressure.copy_from_slice(&p_new);
716    }
717}