use crate::{Real, RealConvert};
use nalgebra::Vector3;
use numeric_literals::replace_float_literals;
#[cfg(all(target_arch = "aarch64"))]
use core::arch::aarch64::float32x4_t;
#[cfg(target_arch = "x86")]
use core::arch::x86::__m256;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::__m256;
/// Helper providing particle volume formulas derived from a particle radius.
pub struct Volume;

impl Volume {
    /// Volume of the axis-aligned cube with edge length `2 * particle_radius`.
    pub fn cube_particle<R: Real>(particle_radius: R) -> R {
        let diameter = particle_radius + particle_radius;
        diameter.powi(3)
    }

    /// Volume of the sphere with radius `particle_radius`, i.e. `(4/3) * pi * r^3`.
    pub fn sphere_particle<R: Real>(particle_radius: R) -> R {
        R::from_float(4.0) * R::frac_pi_3() * particle_radius.powi(3)
    }
}
/// Trait for radially symmetric smoothing kernels in three dimensions.
pub trait SymmetricKernel3d<R: Real> {
    /// Returns the radius beyond which the kernel evaluates to zero.
    fn compact_support_radius(&self) -> R;
    /// Evaluates the kernel at the radial distance `r` from the kernel origin.
    fn evaluate(&self, r: R) -> R;
    /// Evaluates the kernel gradient at the point `x` relative to the kernel origin.
    fn evaluate_gradient(&self, x: Vector3<R>) -> Vector3<R>;
    /// Evaluates the radial derivative of the kernel at distance `r`.
    /// NOTE(review): despite the name, implementations return a signed value
    /// (negative where the kernel decreases with `r`), not an absolute norm.
    fn evaluate_gradient_norm(&self, r: R) -> R;
}
/// The classic cubic B-spline smoothing kernel with compact support.
pub struct CubicSplineKernel<R: Real> {
    /// Support radius `h`: the kernel is zero for `r >= h` (internally mapped
    /// to the spline parameter `q = 2r/h` in `[0, 2]`).
    compact_support_radius: R,
    /// Pre-computed factor `8 / h^3`; the remaining `1/pi` of the full
    /// normalization `8 / (pi * h^3)` is folded into `cubic_function`.
    normalization: R,
}
impl<R: Real> CubicSplineKernel<R> {
    /// Constructs a cubic spline kernel with the given compact support radius `h`.
    #[replace_float_literals(R::from_float(literal))]
    pub fn new(compact_support_radius: R) -> Self {
        let h = compact_support_radius;
        // Radial normalization factor 8 / h^3; the 1/pi part of the full
        // normalization lives inside `cubic_function` / `cubic_function_dq`.
        let sigma = 8.0 / (h * h * h);
        Self {
            compact_support_radius,
            normalization: sigma,
        }
    }

    /// Piecewise cubic spline profile evaluated at `q = 2r/h`; zero for `q >= 2`.
    #[replace_float_literals(R::from_float(literal))]
    fn cubic_function(q: R) -> R {
        if q < R::one() {
            (3.0 / (2.0 * R::pi())) * ((2.0 / 3.0) - q * q + 0.5 * q * q * q)
        } else if q < 2.0 {
            let two_minus_q = 2.0 - q;
            (R::one() / (4.0 * R::pi())) * two_minus_q * two_minus_q * two_minus_q
        } else {
            0.0
        }
    }

    /// Derivative of `cubic_function` with respect to `q`; zero for `q >= 2`.
    #[replace_float_literals(R::from_float(literal))]
    fn cubic_function_dq(q: R) -> R {
        if q < 1.0 {
            (3.0 / (4.0 * R::pi())) * (-4.0 * q + 3.0 * q * q)
        } else if q < 2.0 {
            let two_minus_q = 2.0 - q;
            -(3.0 / (4.0 * R::pi())) * two_minus_q * two_minus_q
        } else {
            0.0
        }
    }
}
impl<R: Real> SymmetricKernel3d<R> for CubicSplineKernel<R> {
fn compact_support_radius(&self) -> R {
self.compact_support_radius
}
fn evaluate(&self, r: R) -> R {
let q = (r + r) / self.compact_support_radius;
self.normalization * Self::cubic_function(q)
}
fn evaluate_gradient(&self, x: Vector3<R>) -> Vector3<R> {
let r = x.norm();
let drdx = x.unscale(r);
let q = (r + r) / self.compact_support_radius;
let dfdq = Self::cubic_function_dq(q);
let dqdr = (R::one() + R::one()) / self.compact_support_radius;
drdx.scale(self.normalization * dfdq * dqdr)
}
fn evaluate_gradient_norm(&self, r: R) -> R {
let q = (r + r) / self.compact_support_radius;
let dfdq = Self::cubic_function_dq(q);
let dqdr = (R::one() + R::one()) / self.compact_support_radius;
self.normalization * dfdq * dqdr
}
}
/// The kernel must evaluate to exactly zero at and beyond its compact support
/// radius (the `q >= 2` branch returns a literal zero, so `assert_eq!` is safe).
#[test]
fn test_cubic_kernel_r_compact_support() {
    for &h in [0.025, 0.1, 2.0].iter() {
        let kernel = CubicSplineKernel::new(h);
        for &factor in [1.0, 2.0, 10.0].iter() {
            assert_eq!(kernel.evaluate(factor * h), 0.0);
        }
    }
}
/// The kernel integrated over its compact support must be normalized to one.
/// Code kept as-is (summation order affects the float result near the 1e-5
/// tolerance); only comments added.
#[test]
fn test_cubic_kernel_r_integral() {
    let hs = [0.025, 0.1, 2.0];
    // Number of integration cells per support radius along each axis.
    let n = 10;
    for &h in hs.iter() {
        let kernel = CubicSplineKernel::new(h);
        let dr = h / (n as f64);
        let dvol = dr * dr * dr;
        let mut integral = 0.0;
        // Midpoint-rule integration over the cube [-h, h]^3 covering the support.
        for i in -n..n {
            for j in -n..n {
                for k in -n..n {
                    let r_in = Vector3::new(i as f64, j as f64, k as f64) * dr;
                    let r_out = Vector3::new((i + 1) as f64, (j + 1) as f64, (k + 1) as f64) * dr;
                    // Kernel evaluated at the cell center.
                    let r = ((r_in + r_out) * 0.5).norm();
                    integral += dvol * kernel.evaluate(r);
                }
            }
        }
        assert!((integral - 1.0).abs() <= 1e-5);
    }
}
/// Cubic spline kernel evaluated four `f32` lanes at a time with aarch64 NEON
/// intrinsics.
#[cfg(target_arch = "aarch64")]
pub struct CubicSplineKernelNeonF32 {
    /// Reciprocal of the compact support radius, `1 / h`.
    compact_support_inv: f32,
    /// Full normalization factor `8 / (pi * h^3)` (unlike the scalar kernel,
    /// the `1/pi` is folded in here).
    sigma: f32,
}
#[cfg(target_arch = "aarch64")]
impl CubicSplineKernelNeonF32 {
    /// Pre-computes `1/h` and the normalization `8 / (pi * h^3)` for the given
    /// compact support radius `h`.
    pub fn new(compact_support_radius: f32) -> Self {
        let h = compact_support_radius;
        let hhh = h * h * h;
        Self {
            compact_support_inv: 1.0 / h,
            sigma: 8.0 / (std::f32::consts::PI * hhh),
        }
    }

    /// Evaluates the kernel for four radii at once.
    ///
    /// Uses the support-normalized parameter `q = r/h` in `[0, 1]` and the
    /// substitution `v = max(1 - q, 0)`: the inner branch (`q <= 1/2`)
    /// computes `sigma * (1 - 6v + 12v^2 - 6v^3)` (equal to
    /// `sigma * (1 - 6q^2 + 6q^3)`), the outer branch `2 * sigma * v^3`,
    /// which is zero outside the support thanks to the clamp on `v`.
    #[target_feature(enable = "neon")]
    pub fn evaluate(&self, r: float32x4_t) -> float32x4_t {
        use core::arch::aarch64::*;
        let q = vmulq_n_f32(r, self.compact_support_inv);
        // v = max(1 - q, 0): clamps lanes outside the compact support to zero.
        let v = vmaxq_f32(vsubq_f32(vdupq_n_f32(1.0), q), vdupq_n_f32(0.0));
        let v2 = vmulq_f32(v, v);
        let v3 = vmulq_f32(v2, v);
        // Outer branch (1/2 < q): 2 * sigma * v^3.
        let res_outer = vmulq_n_f32(v3, 2.0 * self.sigma);
        // Inner branch (q <= 1/2): sigma * (1 - 6v + 12v^2 - 6v^3),
        // accumulated with fused multiply-add/subtract.
        let mut res_inner = vdupq_n_f32(self.sigma);
        res_inner = vmlsq_n_f32(res_inner, v, 6.0 * self.sigma);
        res_inner = vmlaq_n_f32(res_inner, v2, 12.0 * self.sigma);
        res_inner = vmlsq_n_f32(res_inner, v3, 6.0 * self.sigma);
        // Per-lane select: inner result where q <= 1/2, outer otherwise.
        let leq_than_half = vcleq_f32(q, vdupq_n_f32(0.5));
        vbslq_f32(leq_than_half, res_inner, res_outer)
    }
}
/// Cross-checks the NEON kernel against the scalar kernel on a fine grid of
/// radii covering `[0, 2h]`, plus a few radii at and beyond the support
/// boundary. Code kept byte-identical; only comments added.
#[test]
#[cfg_attr(
    not(all(target_arch = "aarch64", target_feature = "neon")),
    ignore = "Skipped on non-aarch64 targets"
)]
fn test_cubic_spline_kernel_neon() {
    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
    {
        use core::arch::aarch64::*;
        let hs: [f32; 3] = [0.025, 0.1, 2.0];
        for &h in hs.iter() {
            let scalar = CubicSplineKernel::new(h);
            let neon = CubicSplineKernelNeonF32::new(h);
            // n sample radii, processed four lanes per chunk.
            let n: usize = 1024;
            let mut r0: f32 = 0.0;
            let dr: f32 = (2.0 * h) / (n as f32);
            for _chunk in 0..(n / 4) {
                let rs = [r0, r0 + dr, r0 + 2.0 * dr, r0 + 3.0 * dr];
                let r_vec = unsafe { vld1q_f32(rs.as_ptr()) };
                let w_vec = unsafe { neon.evaluate(r_vec) };
                let mut w_neon = [0.0f32; 4];
                unsafe { vst1q_f32(w_neon.as_mut_ptr(), w_vec) };
                for lane in 0..4 {
                    let r_lane = rs[lane];
                    let w_scalar = scalar.evaluate(r_lane);
                    let diff = (w_neon[lane] - w_scalar).abs();
                    // Mixed absolute/relative tolerance to absorb f32 round-off
                    // from the differing evaluation orders.
                    let tol = 5e-6_f32.max(2e-5_f32 * w_scalar.abs());
                    assert!(
                        diff <= tol,
                        "NEON kernel mismatch (h={}, r={}, lane={}): neon={}, scalar={}, diff={}, tol={}",
                        h,
                        r_lane,
                        lane,
                        w_neon[lane],
                        w_scalar,
                        diff,
                        tol
                    );
                }
                r0 += 4.0 * dr;
            }
            // Radii outside the compact support: both kernels must (nearly) vanish.
            for &r in &[h * 1.01, h * 1.5, h * 2.0, h * 2.5] {
                let w_scalar = scalar.evaluate(r);
                let w_neon = {
                    let v = unsafe { vld1q_f32([r, r, r, r].as_ptr()) };
                    let w = unsafe { neon.evaluate(v) };
                    let mut tmp = [0.0f32; 4];
                    unsafe { vst1q_f32(tmp.as_mut_ptr(), w) };
                    tmp[0]
                };
                let diff = (w_neon - w_scalar).abs();
                let tol = 5e-6_f32.max(1e-5_f32 * w_scalar.abs());
                assert!(
                    diff <= tol,
                    "NEON kernel mismatch outside support (h={}, r={}): neon={}, scalar={}, diff={}, tol={}",
                    h,
                    r,
                    w_neon,
                    w_scalar,
                    diff,
                    tol
                );
            }
        }
    }
}
/// Cubic spline kernel evaluated eight `f32` lanes at a time with x86 AVX2/FMA
/// intrinsics.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
pub struct CubicSplineKernelAvxF32 {
    /// Reciprocal of the compact support radius, `1 / h`.
    compact_support_inv: f32,
    /// Full normalization factor `8 / (pi * h^3)` (unlike the scalar kernel,
    /// the `1/pi` is folded in here).
    sigma: f32,
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
impl CubicSplineKernelAvxF32 {
    /// Pre-computes `1/h` and the normalization `8 / (pi * h^3)` for the given
    /// compact support radius `h`.
    pub fn new(compact_support_radius: f32) -> Self {
        let h = compact_support_radius;
        let hhh = h * h * h;
        Self {
            compact_support_inv: 1.0 / h,
            sigma: 8.0 / (std::f32::consts::PI * hhh),
        }
    }

    /// Evaluates the kernel for eight radii at once.
    ///
    /// Uses the support-normalized parameter `q = r/h` in `[0, 1]` and the
    /// substitution `v = max(1 - q, 0)`: the inner branch (`q <= 1/2`)
    /// computes `sigma * (1 - 6v + 12v^2 - 6v^3)`, the outer branch
    /// `2 * sigma * v^3`, which is zero outside the support thanks to the
    /// clamp on `v`.
    #[target_feature(enable = "avx2,fma")]
    pub fn evaluate(&self, r: __m256) -> __m256 {
        #[cfg(target_arch = "x86")]
        use core::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64::*;
        let q = _mm256_mul_ps(r, _mm256_set1_ps(self.compact_support_inv));
        // v = max(1 - q, 0): clamps lanes outside the compact support to zero.
        let v = _mm256_max_ps(_mm256_sub_ps(_mm256_set1_ps(1.0), q), _mm256_set1_ps(0.0));
        let v2 = _mm256_mul_ps(v, v);
        let v3 = _mm256_mul_ps(v2, v);
        // Outer branch (1/2 < q): 2 * sigma * v^3.
        let res_outer = _mm256_mul_ps(v3, _mm256_set1_ps(2.0 * self.sigma));
        // Inner branch (q <= 1/2): sigma * (1 - 6v + 12v^2 - 6v^3) via FMAs.
        let mut res_inner = _mm256_set1_ps(self.sigma);
        res_inner = _mm256_fnmadd_ps(v, _mm256_set1_ps(6.0 * self.sigma), res_inner);
        res_inner = _mm256_fmadd_ps(v2, _mm256_set1_ps(12.0 * self.sigma), res_inner);
        res_inner = _mm256_fnmadd_ps(v3, _mm256_set1_ps(6.0 * self.sigma), res_inner);
        // Per-lane select: inner result where q <= 1/2, outer otherwise.
        let inner_mask = _mm256_cmp_ps::<_CMP_LE_OQ>(q, _mm256_set1_ps(0.5));
        _mm256_blendv_ps(res_outer, res_inner, inner_mask)
    }
}
/// Cross-checks the AVX kernel against the scalar kernel on a fine grid of
/// radii covering `[0, 2h]`, plus a few radii at and beyond the support
/// boundary. Code kept byte-identical; only comments added.
#[test]
#[cfg_attr(
    not(all(
        any(target_arch = "x86_64", target_arch = "x86"),
        target_feature = "avx2",
        target_feature = "fma"
    )),
    ignore = "Skipped on non-x86 targets"
)]
fn test_cubic_spline_kernel_avx() {
    #[cfg(all(
        any(target_arch = "x86_64", target_arch = "x86"),
        target_feature = "avx2",
        target_feature = "fma"
    ))]
    {
        #[cfg(target_arch = "x86")]
        use core::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64::*;
        let hs: [f32; 3] = [0.025, 0.1, 2.0];
        for &h in hs.iter() {
            let scalar = CubicSplineKernel::new(h);
            let avx = CubicSplineKernelAvxF32::new(h);
            // n sample radii, processed eight lanes per chunk.
            let n: usize = 1024;
            let mut r0: f32 = 0.0;
            let dr: f32 = (2.0 * h) / (n as f32);
            for _chunk in 0..(n / 8) {
                let rs = [
                    r0,
                    r0 + dr,
                    r0 + 2.0 * dr,
                    r0 + 3.0 * dr,
                    r0 + 4.0 * dr,
                    r0 + 5.0 * dr,
                    r0 + 6.0 * dr,
                    r0 + 7.0 * dr,
                ];
                let r_vec = unsafe { _mm256_loadu_ps(rs.as_ptr()) };
                let w_vec = unsafe { avx.evaluate(r_vec) };
                let mut w_avx = [0.0f32; 8];
                unsafe { _mm256_storeu_ps(w_avx.as_mut_ptr(), w_vec) };
                for lane in 0..8 {
                    let r_lane = rs[lane];
                    let w_scalar = scalar.evaluate(r_lane);
                    let diff = (w_avx[lane] - w_scalar).abs();
                    // Mixed absolute/relative tolerance to absorb f32 round-off
                    // from the differing evaluation orders.
                    let tol = 1e-6_f32.max(1e-5_f32 * w_scalar.abs());
                    assert!(
                        diff <= tol,
                        "AVX kernel mismatch (h={}, r={}, lane={}): avx={}, scalar={}, diff={}, tol={}",
                        h,
                        r_lane,
                        lane,
                        w_avx[lane],
                        w_scalar,
                        diff,
                        tol
                    );
                }
                r0 += 8.0 * dr;
            }
            // Radii outside the compact support: both kernels must (nearly) vanish.
            for &r in &[h * 1.01, h * 1.5, h * 2.0, h * 2.5] {
                let w_scalar = scalar.evaluate(r);
                let w_avx = {
                    let v = unsafe { _mm256_set1_ps(r) };
                    let w = unsafe { avx.evaluate(v) };
                    let mut tmp = [0.0f32; 8];
                    unsafe { _mm256_storeu_ps(tmp.as_mut_ptr(), w) };
                    tmp[0]
                };
                let diff = (w_avx - w_scalar).abs();
                let tol = 1e-6_f32.max(1e-5_f32 * w_scalar.abs());
                assert!(
                    diff <= tol,
                    "AVX kernel mismatch outside support (h={}, r={}): avx={}, scalar={}, diff={}, tol={}",
                    h,
                    r,
                    w_avx,
                    w_scalar,
                    diff,
                    tol
                );
            }
        }
    }
}
/// Lookup-table approximation of the cubic spline kernel indexed by squared
/// distance, avoiding a square root per evaluation.
pub struct DiscreteSquaredDistanceCubicKernel<R: Real> {
    /// Kernel values sampled at the centers of `n` equally sized bins of
    /// squared distance covering `[0, h^2]`.
    values: Vec<R>,
    /// Width of one squared-distance bin, i.e. `h^2 / n`.
    dr: R,
}
impl<R: Real> DiscreteSquaredDistanceCubicKernel<R> {
    /// Builds a lookup table with `n` entries for a cubic spline kernel with
    /// compact support radius `h`, pre-computing the samples in the (typically
    /// higher-precision) type `PR` before converting them to `R`.
    pub fn new<PR: Real>(n: usize, h: R) -> Self {
        let compact_support: PR = h
            .try_convert()
            .expect("Compact support radius `h` has to fit into kernel pre-computation type `PR`");
        let compact_support_squared = compact_support * compact_support;
        let kernel = CubicSplineKernel::new(compact_support);
        // Squared-distance bin width: the table covers [0, h^2] with n bins.
        let dr = compact_support_squared
            / PR::from_usize(n)
                .expect("Number of discrete kernel steps `n` has to fit into kernel pre-computation type `PR`");
        // Sample the kernel at the center of each squared-distance bin.
        let values: Vec<R> = (0..n)
            .map(|i| {
                let i_and_half = PR::from_usize(i).unwrap() + PR::from_float(0.5);
                let r = (dr * i_and_half).sqrt();
                kernel
                    .evaluate(r)
                    .try_convert()
                    .expect("Kernel value has to fit into target type `R`")
            })
            .collect();
        let dr = dr.try_convert().unwrap();
        Self { values, dr }
    }

    /// Evaluates the tabulated kernel for the given squared distance by
    /// rounding to the nearest bin index. Squared distances beyond the table
    /// clamp to the last bin, whose value is near (but not exactly) zero.
    #[inline(always)]
    pub fn evaluate(&self, r_squared: R) -> R {
        let bin = (r_squared / self.dr)
            .round()
            .to_usize()
            .unwrap()
            .min(self.values.len() - 1);
        self.values[bin]
    }
}
/// Compares the discretized kernel against the continuous kernel on a fine
/// radius grid inside the support: the test only fails when BOTH the relative
/// error exceeds 5% AND the absolute error exceeds 0.1, matching the original
/// condition, but folds the `eprintln!` + `assert!(false)` pair into a single
/// `assert!` carrying the same diagnostics in its panic message.
#[test]
fn test_discrete_kernel() {
    let n = 10000;
    let h = 0.025;
    let discrete_kernel = DiscreteSquaredDistanceCubicKernel::new::<f64>(n, h);
    let kernel = CubicSplineKernel::new(h);
    let dr = h / (n as f64);
    for i in 0..n {
        let r = (i as f64) * dr;
        let rr = r * r;
        let discrete = discrete_kernel.evaluate(rr);
        let continuous = kernel.evaluate(r);
        let diff = (discrete - continuous).abs();
        let rel_diff = diff / continuous;
        // Pass if either the relative or the absolute error is acceptable
        // (equivalent to failing only when both tolerances are exceeded).
        assert!(
            rel_diff <= 5e-2 || diff <= 1e-1,
            "at r={}, r/h={}, discrete: {}, continuous: {}, diff: {}, rel_diff: {}",
            r,
            r / h,
            discrete,
            continuous,
            diff,
            rel_diff
        );
    }
}