use approx::{AbsDiffEq, RelativeEq, UlpsEq};
use ndarray::Array1;
use wide::{CmpGt, CmpLt, f32x8};
use crate::DekeError;
#[inline(always)]
fn simd_load(slice: &[f32], off: usize) -> f32x8 {
let n = 8.min(slice.len().saturating_sub(off));
let mut buf = [0.0; 8];
buf[..n].copy_from_slice(&slice[off..off + n]);
f32x8::new(buf)
}
#[inline(always)]
fn simd_store(v: f32x8, dst: &mut [f32], off: usize) {
let n = 8.min(dst.len().saturating_sub(off));
dst[off..off + n].copy_from_slice(&v.to_array()[..n]);
}
#[inline(always)]
fn simd_binop<const N: usize>(
a: &[f32; N],
b: &[f32; N],
out: &mut [f32; N],
op: fn(f32x8, f32x8) -> f32x8,
) {
let mut off = 0;
while off < N {
simd_store(op(simd_load(a, off), simd_load(b, off)), out, off);
off += 8;
}
}
#[inline(always)]
fn simd_unaryop<const N: usize>(a: &[f32; N], out: &mut [f32; N], op: fn(f32x8) -> f32x8) {
let mut off = 0;
while off < N {
simd_store(op(simd_load(a, off)), out, off);
off += 8;
}
}
#[inline(always)]
fn simd_scalarop<const N: usize>(
a: &[f32; N],
s: f32x8,
out: &mut [f32; N],
op: fn(f32x8, f32x8) -> f32x8,
) {
let mut off = 0;
while off < N {
simd_store(op(simd_load(a, off), s), out, off);
off += 8;
}
}
#[inline(always)]
fn simd_hsum<const N: usize>(a: &[f32; N]) -> f32 {
let mut acc = f32x8::ZERO;
let mut off = 0;
while off < N {
acc += simd_load(a, off);
off += 8;
}
acc.reduce_add()
}
#[inline(always)]
fn simd_load_neg_inf(slice: &[f32], off: usize) -> f32x8 {
let n = 8.min(slice.len().saturating_sub(off));
let mut buf = [f32::NEG_INFINITY; 8];
buf[..n].copy_from_slice(&slice[off..off + n]);
f32x8::new(buf)
}
#[inline(always)]
fn simd_load_inf(slice: &[f32], off: usize) -> f32x8 {
let n = 8.min(slice.len().saturating_sub(off));
let mut buf = [f32::INFINITY; 8];
buf[..n].copy_from_slice(&slice[off..off + n]);
f32x8::new(buf)
}
#[inline(always)]
fn simd_dot<const N: usize>(a: &[f32; N], b: &[f32; N]) -> f32 {
let mut acc = f32x8::ZERO;
let mut off = 0;
while off < N {
acc = simd_load(a, off).mul_add(simd_load(b, off), acc);
off += 8;
}
acc.reduce_add()
}
pub type RobotQ = Array1<f32>;
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct SRobotQ<const N: usize>(pub [f32; N]);
impl<const N: usize> SRobotQ<N> {
pub const fn zeros() -> Self {
Self([0.0; N])
}
pub const fn from_array(arr: [f32; N]) -> Self {
Self(arr)
}
pub const fn as_slice(&self) -> &[f32] {
&self.0
}
pub const fn as_mut_slice(&mut self) -> &mut [f32] {
&mut self.0
}
pub fn to_robotq(&self) -> RobotQ {
RobotQ::from(self.0.to_vec())
}
pub fn force_from_robotq(q: &RobotQ) -> Self {
if let Ok(sq) = Self::try_from(q) {
sq
} else {
let slice = q.as_slice().unwrap_or(&[]);
let mut arr = [0.0; N];
for i in 0..N {
arr[i] = *slice.get(i).unwrap_or(&0.0);
}
Self(arr)
}
}
pub fn norm(&self) -> f32 {
if N <= 16 {
self.dot(self).sqrt()
} else {
self.0.iter().map(|x| x * x).sum::<f32>().sqrt()
}
}
pub fn dot(&self, other: &Self) -> f32 {
if N <= 16 {
simd_dot(&self.0, &other.0)
} else {
self.0.iter().zip(other.0.iter()).map(|(a, b)| a * b).sum()
}
}
pub fn map(&self, f: impl Fn(f32) -> f32) -> Self {
let mut out = [0.0; N];
for i in 0..N {
out[i] = f(self.0[i]);
}
Self(out)
}
pub fn sum(&self) -> f32 {
if N <= 16 {
simd_hsum(&self.0)
} else {
self.0.iter().sum()
}
}
pub fn splat(val: f32) -> Self {
Self([val; N])
}
pub fn from_fn(f: impl Fn(usize) -> f32) -> Self {
let mut out = [0.0; N];
for i in 0..N {
out[i] = f(i);
}
Self(out)
}
pub fn norm_squared(&self) -> f32 {
self.dot(self)
}
pub fn normalize(&self) -> Self {
let n = self.norm();
debug_assert!(n > 0.0, "cannot normalize zero-length SRobotQ");
*self / n
}
pub fn distance(&self, other: &Self) -> f32 {
(*self - *other).norm()
}
pub fn distance_squared(&self, other: &Self) -> f32 {
(*self - *other).norm_squared()
}
pub fn abs(&self) -> Self {
if N <= 16 {
let mut out = [0.0; N];
simd_unaryop(&self.0, &mut out, |a| a.abs());
Self(out)
} else {
self.map(f32::abs)
}
}
pub fn clamp(&self, min: &Self, max: &Self) -> Self {
if N <= 16 {
let mut out = [0.0; N];
let mut off = 0;
while off < N {
let v = simd_load(&self.0, off);
let lo = simd_load(&min.0, off);
let hi = simd_load(&max.0, off);
simd_store(v.fast_max(lo).fast_min(hi), &mut out, off);
off += 8;
}
Self(out)
} else {
let mut out = [0.0; N];
for i in 0..N {
out[i] = self.0[i].clamp(min.0[i], max.0[i]);
}
Self(out)
}
}
pub fn clamp_scalar(&self, min: f32, max: f32) -> Self {
if N <= 16 {
let mut out = [0.0; N];
let lo = f32x8::splat(min);
let hi = f32x8::splat(max);
let mut off = 0;
while off < N {
let v = simd_load(&self.0, off);
simd_store(v.fast_max(lo).fast_min(hi), &mut out, off);
off += 8;
}
Self(out)
} else {
self.map(|x| x.clamp(min, max))
}
}
pub fn max_element(&self) -> f32 {
if N <= 16 {
let mut acc = f32x8::splat(f32::NEG_INFINITY);
let mut off = 0;
while off < N {
acc = acc.fast_max(simd_load_neg_inf(&self.0, off));
off += 8;
}
let a = acc.to_array();
a[0].max(a[1])
.max(a[2].max(a[3]))
.max(a[4].max(a[5]).max(a[6].max(a[7])))
} else {
self.0.iter().copied().fold(f32::NEG_INFINITY, f32::max)
}
}
pub fn min_element(&self) -> f32 {
if N <= 16 {
let mut acc = f32x8::splat(f32::INFINITY);
let mut off = 0;
while off < N {
acc = acc.fast_min(simd_load_inf(&self.0, off));
off += 8;
}
let a = acc.to_array();
a[0].min(a[1])
.min(a[2].min(a[3]))
.min(a[4].min(a[5]).min(a[6].min(a[7])))
} else {
self.0.iter().copied().fold(f32::INFINITY, f32::min)
}
}
pub fn linf_norm(&self) -> f32 {
self.abs().max_element()
}
pub fn elementwise_mul(&self, other: &Self) -> Self {
let mut out = [0.0; N];
if N <= 16 {
simd_binop(&self.0, &other.0, &mut out, |a, b| a * b);
} else {
for i in 0..N {
out[i] = self.0[i] * other.0[i];
}
}
Self(out)
}
pub fn elementwise_div(&self, other: &Self) -> Self {
let mut out = [0.0; N];
if N <= 16 {
simd_binop(&self.0, &other.0, &mut out, |a, b| a / b);
} else {
for i in 0..N {
out[i] = self.0[i] / other.0[i];
}
}
Self(out)
}
pub fn zip_map(&self, other: &Self, f: impl Fn(f32, f32) -> f32) -> Self {
let mut out = [0.0; N];
for i in 0..N {
out[i] = f(self.0[i], other.0[i]);
}
Self(out)
}
pub fn sqrt(&self) -> Self {
if N <= 16 {
let mut out = [0.0; N];
simd_unaryop(&self.0, &mut out, |a| a.sqrt());
Self(out)
} else {
self.map(f32::sqrt)
}
}
pub fn mul_add(&self, mul: &Self, add: &Self) -> Self {
if N <= 16 {
let mut out = [0.0; N];
let mut off = 0;
while off < N {
let a = simd_load(&self.0, off);
let m = simd_load(&mul.0, off);
let d = simd_load(&add.0, off);
simd_store(a.mul_add(m, d), &mut out, off);
off += 8;
}
Self(out)
} else {
let mut out = [0.0; N];
for i in 0..N {
out[i] = self.0[i].mul_add(mul.0[i], add.0[i]);
}
Self(out)
}
}
pub fn any_non_finite(&self) -> bool {
let mut off = 0;
while off < N {
let v = simd_load(&self.0, off);
let bad = v.is_nan() | v.is_inf();
if (bad.to_bitmask() & Self::lane_mask(off)) != 0 {
return true;
}
off += 8;
}
false
}
pub fn any_gt(&self, other: &Self) -> bool {
let mut off = 0;
while off < N {
let a = simd_load(&self.0, off);
let b = simd_load(&other.0, off);
if (a.simd_gt(b).to_bitmask() & Self::lane_mask(off)) != 0 {
return true;
}
off += 8;
}
false
}
pub fn any_lt(&self, other: &Self) -> bool {
let mut off = 0;
while off < N {
let a = simd_load(&self.0, off);
let b = simd_load(&other.0, off);
if (a.simd_lt(b).to_bitmask() & Self::lane_mask(off)) != 0 {
return true;
}
off += 8;
}
false
}
#[inline(always)]
const fn lane_mask(off: usize) -> u32 {
let active = N.saturating_sub(off);
if active >= 8 {
0b11111111
} else {
(1 << active) - 1
}
}
pub fn is_close(&self, other: &Self, tol: f32) -> bool {
let diff = *self - *other;
diff.dot(&diff).sqrt() < tol
}
pub fn interpolate(&self, other: &Self, t: f32) -> Self {
*self + ((*other - *self) * t)
}
}
impl<const N: usize> std::ops::Index<usize> for SRobotQ<N> {
type Output = f32;
#[inline]
fn index(&self, i: usize) -> &f32 {
&self.0[i]
}
}
impl<const N: usize> std::ops::IndexMut<usize> for SRobotQ<N> {
#[inline]
fn index_mut(&mut self, i: usize) -> &mut f32 {
&mut self.0[i]
}
}
impl<const N: usize> std::ops::Add for SRobotQ<N> {
type Output = Self;
#[inline]
fn add(self, rhs: Self) -> Self {
let mut out = [0.0; N];
if N <= 16 {
simd_binop(&self.0, &rhs.0, &mut out, |a, b| a + b);
} else {
for i in 0..N {
out[i] = self.0[i] + rhs.0[i];
}
}
Self(out)
}
}
impl<const N: usize> std::ops::Sub for SRobotQ<N> {
type Output = Self;
#[inline]
fn sub(self, rhs: Self) -> Self {
let mut out = [0.0; N];
if N <= 16 {
simd_binop(&self.0, &rhs.0, &mut out, |a, b| a - b);
} else {
for i in 0..N {
out[i] = self.0[i] - rhs.0[i];
}
}
Self(out)
}
}
impl<const N: usize> std::ops::Neg for SRobotQ<N> {
type Output = Self;
#[inline]
fn neg(self) -> Self {
let mut out = [0.0; N];
if N <= 16 {
simd_unaryop(&self.0, &mut out, |a| f32x8::ZERO - a);
} else {
for i in 0..N {
out[i] = -self.0[i];
}
}
Self(out)
}
}
impl<const N: usize> std::ops::Mul<f32> for SRobotQ<N> {
type Output = Self;
#[inline]
fn mul(self, rhs: f32) -> Self {
let mut out = [0.0; N];
if N <= 16 {
simd_scalarop(&self.0, f32x8::splat(rhs), &mut out, |a, s| a * s);
} else {
for i in 0..N {
out[i] = self.0[i] * rhs;
}
}
Self(out)
}
}
impl<const N: usize> std::ops::Mul<SRobotQ<N>> for f32 {
type Output = SRobotQ<N>;
#[inline]
fn mul(self, rhs: SRobotQ<N>) -> SRobotQ<N> {
rhs * self
}
}
impl<const N: usize> std::ops::Div<f32> for SRobotQ<N> {
type Output = Self;
#[inline]
fn div(self, rhs: f32) -> Self {
let mut out = [0.0; N];
if N <= 16 {
simd_scalarop(&self.0, f32x8::splat(rhs), &mut out, |a, s| a / s);
} else {
for i in 0..N {
out[i] = self.0[i] / rhs;
}
}
Self(out)
}
}
impl<const N: usize> std::ops::AddAssign for SRobotQ<N> {
#[inline]
fn add_assign(&mut self, rhs: Self) {
if N <= 16 {
let mut out = [0.0; N];
simd_binop(&self.0, &rhs.0, &mut out, |a, b| a + b);
self.0 = out;
} else {
for i in 0..N {
self.0[i] += rhs.0[i];
}
}
}
}
impl<const N: usize> std::ops::SubAssign for SRobotQ<N> {
#[inline]
fn sub_assign(&mut self, rhs: Self) {
if N <= 16 {
let mut out = [0.0; N];
simd_binop(&self.0, &rhs.0, &mut out, |a, b| a - b);
self.0 = out;
} else {
for i in 0..N {
self.0[i] -= rhs.0[i];
}
}
}
}
impl<const N: usize> std::ops::MulAssign<f32> for SRobotQ<N> {
#[inline]
fn mul_assign(&mut self, rhs: f32) {
if N <= 16 {
let mut out = [0.0; N];
simd_scalarop(&self.0, f32x8::splat(rhs), &mut out, |a, s| a * s);
self.0 = out;
} else {
for i in 0..N {
self.0[i] *= rhs;
}
}
}
}
impl<const N: usize> std::ops::DivAssign<f32> for SRobotQ<N> {
#[inline]
fn div_assign(&mut self, rhs: f32) {
if N <= 16 {
let mut out = [0.0; N];
simd_scalarop(&self.0, f32x8::splat(rhs), &mut out, |a, s| a / s);
self.0 = out;
} else {
for i in 0..N {
self.0[i] /= rhs;
}
}
}
}
impl<const N: usize> std::ops::Add<SRobotQ<N>> for &RobotQ {
type Output = SRobotQ<N>;
#[inline]
fn add(self, rhs: SRobotQ<N>) -> SRobotQ<N> {
SRobotQ::<N>::force_from_robotq(self) + rhs
}
}
impl<const N: usize> std::ops::Sub<SRobotQ<N>> for &RobotQ {
type Output = SRobotQ<N>;
#[inline]
fn sub(self, rhs: SRobotQ<N>) -> SRobotQ<N> {
SRobotQ::<N>::force_from_robotq(self) - rhs
}
}
impl<const N: usize> Default for SRobotQ<N> {
#[inline]
fn default() -> Self {
Self::zeros()
}
}
impl<const N: usize> AsRef<[f32; N]> for SRobotQ<N> {
#[inline]
fn as_ref(&self) -> &[f32; N] {
&self.0
}
}
impl<const N: usize> AsMut<[f32; N]> for SRobotQ<N> {
#[inline]
fn as_mut(&mut self) -> &mut [f32; N] {
&mut self.0
}
}
impl<const N: usize> AsRef<[f32]> for SRobotQ<N> {
#[inline]
fn as_ref(&self) -> &[f32] {
&self.0
}
}
impl<const N: usize> AsMut<[f32]> for SRobotQ<N> {
#[inline]
fn as_mut(&mut self) -> &mut [f32] {
&mut self.0
}
}
impl<const N: usize> From<[f32; N]> for SRobotQ<N> {
#[inline]
fn from(arr: [f32; N]) -> Self {
Self(arr)
}
}
impl<const N: usize> From<&[f32; N]> for SRobotQ<N> {
#[inline]
fn from(arr: &[f32; N]) -> Self {
Self(*arr)
}
}
impl<const N: usize> From<[f64; N]> for SRobotQ<N> {
#[inline]
fn from(arr: [f64; N]) -> Self {
let mut out = [0.0f32; N];
let mut i = 0;
while i < N {
out[i] = arr[i] as f32;
i += 1;
}
Self(out)
}
}
impl<const N: usize> From<&[f64; N]> for SRobotQ<N> {
#[inline]
fn from(arr: &[f64; N]) -> Self {
Self::from(*arr)
}
}
impl<const N: usize> From<SRobotQ<N>> for [f32; N] {
#[inline]
fn from(q: SRobotQ<N>) -> [f32; N] {
q.0
}
}
impl<const N: usize> From<SRobotQ<N>> for Vec<f32> {
#[inline]
fn from(q: SRobotQ<N>) -> Vec<f32> {
q.0.to_vec()
}
}
impl<const N: usize> From<SRobotQ<N>> for RobotQ {
#[inline]
fn from(q: SRobotQ<N>) -> RobotQ {
q.to_robotq()
}
}
impl<const N: usize> TryFrom<&SRobotQ<N>> for SRobotQ<N> {
type Error = DekeError;
#[inline]
fn try_from(q: &SRobotQ<N>) -> Result<Self, Self::Error> {
Ok(*q)
}
}
impl<const N: usize> TryFrom<&[f32]> for SRobotQ<N> {
type Error = DekeError;
#[inline]
fn try_from(slice: &[f32]) -> Result<Self, Self::Error> {
if slice.len() != N {
return Err(DekeError::ShapeMismatch {
expected: N,
found: slice.len(),
});
}
let mut arr = [0.0; N];
arr.copy_from_slice(slice);
Ok(Self(arr))
}
}
impl<const N: usize> TryFrom<Vec<f32>> for SRobotQ<N> {
type Error = DekeError;
#[inline]
fn try_from(v: Vec<f32>) -> Result<Self, Self::Error> {
Self::try_from(v.as_slice())
}
}
impl<const N: usize> TryFrom<&Vec<f32>> for SRobotQ<N> {
type Error = DekeError;
#[inline]
fn try_from(v: &Vec<f32>) -> Result<Self, Self::Error> {
Self::try_from(v.as_slice())
}
}
impl<const N: usize> TryFrom<&[f64]> for SRobotQ<N> {
type Error = DekeError;
#[inline]
fn try_from(slice: &[f64]) -> Result<Self, Self::Error> {
if slice.len() != N {
return Err(DekeError::ShapeMismatch {
expected: N,
found: slice.len(),
});
}
let mut arr = [0.0f32; N];
let mut i = 0;
while i < N {
arr[i] = slice[i] as f32;
i += 1;
}
Ok(Self(arr))
}
}
impl<const N: usize> TryFrom<Vec<f64>> for SRobotQ<N> {
type Error = DekeError;
#[inline]
fn try_from(v: Vec<f64>) -> Result<Self, Self::Error> {
Self::try_from(v.as_slice())
}
}
impl<const N: usize> TryFrom<&Vec<f64>> for SRobotQ<N> {
type Error = DekeError;
#[inline]
fn try_from(v: &Vec<f64>) -> Result<Self, Self::Error> {
Self::try_from(v.as_slice())
}
}
impl<const N: usize> TryFrom<&RobotQ> for SRobotQ<N> {
type Error = DekeError;
#[inline]
fn try_from(q: &RobotQ) -> Result<Self, Self::Error> {
let slice = q.as_slice().unwrap_or(&[]);
if slice.len() != N {
return Err(DekeError::ShapeMismatch {
expected: N,
found: slice.len(),
});
}
let mut arr = [0.0; N];
arr.copy_from_slice(slice);
Ok(Self(arr))
}
}
impl<const N: usize> AbsDiffEq for SRobotQ<N> {
type Epsilon = f32;
fn default_epsilon() -> f32 {
f32::default_epsilon()
}
fn abs_diff_eq(&self, other: &Self, epsilon: f32) -> bool {
self.0
.iter()
.zip(other.0.iter())
.all(|(a, b)| a.abs_diff_eq(b, epsilon))
}
}
impl<const N: usize> RelativeEq for SRobotQ<N> {
fn default_max_relative() -> f32 {
f32::default_max_relative()
}
fn relative_eq(&self, other: &Self, epsilon: f32, max_relative: f32) -> bool {
self.0
.iter()
.zip(other.0.iter())
.all(|(a, b)| a.relative_eq(b, epsilon, max_relative))
}
}
impl<const N: usize> UlpsEq for SRobotQ<N> {
fn default_max_ulps() -> u32 {
f32::default_max_ulps()
}
fn ulps_eq(&self, other: &Self, epsilon: f32, max_ulps: u32) -> bool {
self.0
.iter()
.zip(other.0.iter())
.all(|(a, b)| a.ulps_eq(b, epsilon, max_ulps))
}
}
impl<const N: usize> TryFrom<RobotQ> for SRobotQ<N> {
type Error = DekeError;
#[inline]
fn try_from(q: RobotQ) -> Result<Self, Self::Error> {
let slice = q.as_slice().unwrap_or(&[]);
if slice.len() != N {
return Err(DekeError::ShapeMismatch {
expected: N,
found: slice.len(),
});
}
let mut arr = [0.0; N];
arr.copy_from_slice(slice);
Ok(Self(arr))
}
}
impl<const N: usize> From<&SRobotQ<N>> for RobotQ {
#[inline]
fn from(sq: &SRobotQ<N>) -> RobotQ {
sq.to_robotq()
}
}
pub fn robotq<T: Into<f64>>(vals: impl IntoIterator<Item = T>) -> RobotQ {
vals.into_iter().map(|v| v.into() as f32).collect()
}