#![cfg(feature = "avx_luts")]
use crate::conversions::interpolator::BarycentricWeight;
use crate::math::FusedMultiplyAdd;
use std::arch::x86_64::*;
use std::ops::{Add, Mul, Sub};
/// One lookup-table cell: four `i16` lanes with C layout and 8-byte alignment.
///
/// The fetchers in this file read a cell with a single 64-bit load, which is exactly the
/// size of this type.
#[repr(C, align(8))]
pub(crate) struct AvxAlignedI16(pub(crate) [i16; 4]);
// Field-less marker types, one per interpolation kernel. `GRID_SIZE` is forwarded to the
// table fetchers, which use it to turn (x, y, z) grid coordinates into a table offset.

/// Single-table tetrahedral kernel.
#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxQ0_15<const GRID_SIZE: usize> {}
/// Single-table pyramidal kernel.
#[cfg(feature = "options")]
pub(crate) struct PyramidalAvxQ0_15<const GRID_SIZE: usize> {}
/// Single-table prismatic kernel.
#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxQ0_15<const GRID_SIZE: usize> {}
/// Single-table trilinear kernel; available without the `options` feature.
pub(crate) struct TrilinearAvxQ0_15<const GRID_SIZE: usize> {}
/// Two-table prismatic kernel.
#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxQ0_15Double<const GRID_SIZE: usize> {}
/// Two-table trilinear kernel; available without the `options` feature.
pub(crate) struct TrilinearAvxQ0_15Double<const GRID_SIZE: usize> {}
/// Two-table pyramidal kernel.
#[cfg(feature = "options")]
pub(crate) struct PyramidAvxFmaQ0_15Double<const GRID_SIZE: usize> {}
/// Two-table tetrahedral kernel.
#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxQ0_15Double<const GRID_SIZE: usize> {}
/// Interpolation of two lookup tables at once, sharing one set of grid coordinates and
/// weights between them.
pub(crate) trait AvxMdInterpolationQ0_15Double {
/// Interpolates `table0` and `table1` at the position described by `lut[in_r]`,
/// `lut[in_g]` and `lut[in_b]`, returning `(result for table0, result for table1)`.
///
/// NOTE(review): the implementations in this file index `lut` and the tables without
/// bounds checks and run AVX2 code, so callers are presumably expected to have validated
/// the indices and the CPU feature beforehand — confirm at the call sites.
fn inter3_sse(
&self,
table0: &[AvxAlignedI16],
table1: &[AvxAlignedI16],
in_r: usize,
in_g: usize,
in_b: usize,
lut: &[BarycentricWeight<i16>],
) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse);
}
/// Interpolation of a single lookup table.
pub(crate) trait AvxMdInterpolationQ0_15 {
/// Interpolates `table` at the position described by `lut[in_r]`, `lut[in_g]` and
/// `lut[in_b]` and returns the resulting vector.
///
/// NOTE(review): the implementations in this file index `lut` and `table` without bounds
/// checks and run AVX2 code, so callers are presumably expected to have validated the
/// indices and the CPU feature beforehand — confirm at the call sites.
fn inter3_sse(
&self,
table: &[AvxAlignedI16],
in_r: usize,
in_g: usize,
in_b: usize,
lut: &[BarycentricWeight<i16>],
) -> AvxVectorQ0_15Sse;
}
/// Source of lookup-table values addressed by integer grid coordinates.
trait Fetcher<T> {
/// Returns the value stored at grid node `(x, y, z)`.
fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
/// Transparent wrapper around a 128-bit integer register.
///
/// The arithmetic impls in this file treat it as packed 16-bit lanes.
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorQ0_15Sse {
// Raw register value.
pub(crate) v: __m128i,
}
/// Transparent wrapper around a 256-bit integer register.
///
/// The arithmetic impls in this file treat it as packed 16-bit lanes; the kernels use it to
/// carry two 128-bit values side by side (see `from_sse` / `split`).
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorQ0_15 {
// Raw register value.
pub(crate) v: __m256i,
}
impl AvxVectorQ0_15 {
    /// Builds a 256-bit vector whose low 128 bits are `lo` and whose high 128 bits are `hi`.
    #[inline(always)]
    pub(crate) fn from_sse(lo: AvxVectorQ0_15Sse, hi: AvxVectorQ0_15Sse) -> AvxVectorQ0_15 {
        let combined = unsafe {
            // The cast leaves the upper half unspecified; the insert then fills it with `hi`.
            let low_half = _mm256_castsi128_si256(lo.v);
            _mm256_inserti128_si256::<1>(low_half, hi.v)
        };
        Self { v: combined }
    }

    /// Splits the vector into its `(low, high)` 128-bit halves; the inverse of `from_sse`.
    #[inline(always)]
    pub(crate) fn split(self) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let (low, high) = unsafe {
            (
                _mm256_castsi256_si128(self.v),
                _mm256_extracti128_si256::<1>(self.v),
            )
        };
        (AvxVectorQ0_15Sse { v: low }, AvxVectorQ0_15Sse { v: high })
    }
}
impl From<i16> for AvxVectorQ0_15Sse {
#[inline(always)]
fn from(v: i16) -> Self {
AvxVectorQ0_15Sse {
v: unsafe { _mm_set1_epi16(v) },
}
}
}
impl From<i16> for AvxVectorQ0_15 {
#[inline(always)]
fn from(v: i16) -> Self {
AvxVectorQ0_15 {
v: unsafe { _mm256_set1_epi16(v) },
}
}
}
impl Sub<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
AvxVectorQ0_15Sse {
v: unsafe { _mm_sub_epi16(self.v, rhs.v) },
}
}
}
impl Sub<AvxVectorQ0_15> for AvxVectorQ0_15 {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: AvxVectorQ0_15) -> Self::Output {
AvxVectorQ0_15 {
v: unsafe { _mm256_sub_epi16(self.v, rhs.v) },
}
}
}
impl Add<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
type Output = Self;
#[inline(always)]
fn add(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
AvxVectorQ0_15Sse {
v: unsafe { _mm_add_epi16(self.v, rhs.v) },
}
}
}
impl Mul<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
AvxVectorQ0_15Sse {
v: unsafe { _mm_mulhrs_epi16(self.v, rhs.v) },
}
}
}
impl Add<AvxVectorQ0_15> for AvxVectorQ0_15 {
type Output = Self;
#[inline(always)]
fn add(self, rhs: AvxVectorQ0_15) -> Self::Output {
AvxVectorQ0_15 {
v: unsafe { _mm256_add_epi16(self.v, rhs.v) },
}
}
}
impl Mul<AvxVectorQ0_15> for AvxVectorQ0_15 {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: AvxVectorQ0_15) -> Self::Output {
AvxVectorQ0_15 {
v: unsafe { _mm256_mulhrs_epi16(self.v, rhs.v) },
}
}
}
impl FusedMultiplyAdd<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
    /// Returns `self + b * c` per 16-bit lane.
    ///
    /// The product is the rounded fixed-point multiply (`_mm_mulhrs_epi16`) and the
    /// addition wraps; the two steps are separate instructions, not a hardware FMA.
    #[inline(always)]
    fn mla(&self, b: AvxVectorQ0_15Sse, c: AvxVectorQ0_15Sse) -> AvxVectorQ0_15Sse {
        let accumulated = unsafe {
            let product = _mm_mulhrs_epi16(b.v, c.v);
            _mm_add_epi16(product, self.v)
        };
        Self { v: accumulated }
    }
}
impl FusedMultiplyAdd<AvxVectorQ0_15> for AvxVectorQ0_15 {
    /// Returns `self + b * c` per 16-bit lane.
    ///
    /// The product is the rounded fixed-point multiply (`_mm256_mulhrs_epi16`) and the
    /// addition wraps; the two steps are separate instructions, not a hardware FMA.
    #[inline(always)]
    fn mla(&self, b: AvxVectorQ0_15, c: AvxVectorQ0_15) -> AvxVectorQ0_15 {
        let accumulated = unsafe {
            let product = _mm256_mulhrs_epi16(b.v, c.v);
            _mm256_add_epi16(product, self.v)
        };
        Self { v: accumulated }
    }
}
/// Fetcher over a single table that yields one 128-bit vector per grid node.
///
/// Despite the name it is used by every single-table kernel in this file, not only the
/// tetrahedral one.
struct TetrahedralAvxSseFetchVector<'a, const GRID_SIZE: usize> {
// Table cells, addressed as x * GRID_SIZE^2 + y * GRID_SIZE + z.
cube: &'a [AvxAlignedI16],
}
/// Fetcher over two tables that yields one 256-bit vector per grid node, with the `cube0`
/// cell in the low 128 bits and the `cube1` cell in the high 128 bits.
struct TetrahedralAvxFetchVector<'a, const GRID_SIZE: usize> {
// First table; lands in the low half of each fetched vector.
cube0: &'a [AvxAlignedI16],
// Second table; lands in the high half of each fetched vector.
cube1: &'a [AvxAlignedI16],
}
impl<const GRID_SIZE: usize> Fetcher<AvxVectorQ0_15> for TetrahedralAvxFetchVector<'_, GRID_SIZE> {
    /// Loads the cell at grid node `(x, y, z)` from both tables: `cube0` into the low
    /// 128 bits and `cube1` into the high 128 bits.
    ///
    /// The node offset is `x * GRID_SIZE^2 + y * GRID_SIZE + z`. Neither table is bounds
    /// checked, so the offset must lie inside both slices.
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorQ0_15 {
        let row_stride = GRID_SIZE as u32;
        let plane_stride = GRID_SIZE as u32 * GRID_SIZE as u32;
        let node = (x as u32 * plane_stride + y as u32 * row_stride + z as u32) as usize;

        let packed = unsafe {
            let first = self.cube0.get_unchecked(node..).as_ptr().cast::<u8>();
            let second = self.cube1.get_unchecked(node..).as_ptr().cast::<u8>();
            // Each load reads the 8 bytes of one cell into the low half of a 128-bit register.
            let low = _mm_loadu_si64(first);
            let high = _mm_loadu_si64(second);
            _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(low), high)
        };
        AvxVectorQ0_15 { v: packed }
    }
}
impl<const GRID_SIZE: usize> Fetcher<AvxVectorQ0_15Sse>
    for TetrahedralAvxSseFetchVector<'_, GRID_SIZE>
{
    /// Loads the cell at grid node `(x, y, z)` into the low 64 bits of a 128-bit vector.
    ///
    /// The node offset is `x * GRID_SIZE^2 + y * GRID_SIZE + z`. The table is not bounds
    /// checked, so the offset must lie inside the slice.
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorQ0_15Sse {
        let row_stride = GRID_SIZE as u32;
        let plane_stride = GRID_SIZE as u32 * GRID_SIZE as u32;
        let node = (x as u32 * plane_stride + y as u32 * row_stride + z as u32) as usize;

        let cell = unsafe {
            let source = self.cube.get_unchecked(node..).as_ptr().cast::<u8>();
            _mm_loadu_si64(source)
        };
        AvxVectorQ0_15Sse { v: cell }
    }
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxQ0_15<GRID_SIZE> {
/// Tetrahedral interpolation of one table.
///
/// `lut[in_r]`, `lut[in_g]` and `lut[in_b]` supply, per axis, the two grid coordinates
/// (`x`, `x_n`) and the weight `w`. The ordering of the three weights selects which
/// corners of the cell are fetched through `r`; the result is
/// `c0 + c1 * rx + c2 * ry + c3 * rz`.
///
/// # Safety
///
/// The CPU must support AVX2, and `in_r`, `in_g`, `in_b` must be valid indices into `lut`
/// (they are read with `get_unchecked`).
#[target_feature(enable = "avx2")]
unsafe fn interpolate(
&self,
in_r: usize,
in_g: usize,
in_b: usize,
lut: &[BarycentricWeight<i16>],
r: impl Fetcher<AvxVectorQ0_15Sse>,
) -> AvxVectorQ0_15Sse {
let lut_r = unsafe { *lut.get_unchecked(in_r) };
let lut_g = unsafe { *lut.get_unchecked(in_g) };
let lut_b = unsafe { *lut.get_unchecked(in_b) };
// First grid coordinate on each axis.
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
// Second grid coordinate on each axis.
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
// Per-axis weights.
let rx = lut_r.w;
let ry = lut_g.w;
let rz = lut_b.w;
// Base corner; every branch builds its differences relative to it.
let c0 = r.fetch(x, y, z);
let c2;
let c1;
let c3;
if rx >= ry {
if ry >= rz {
// rx >= ry >= rz
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if rx >= rz {
// rx >= rz > ry
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
} else {
// rz > rx >= ry
c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
} else if rx >= rz {
// ry > rx >= rz
c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if ry >= rz {
// ry >= rz > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
} else {
// rz > ry > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
// Accumulate c0 + c1 * rx + c2 * ry + c3 * rz, one multiply-add per term.
let s0 = c0.mla(c1, AvxVectorQ0_15Sse::from(rx));
let s1 = s0.mla(c2, AvxVectorQ0_15Sse::from(ry));
s1.mla(c3, AvxVectorQ0_15Sse::from(rz))
}
}
/// Implements `AvxMdInterpolationQ0_15` for a single-table kernel type by wrapping the
/// table in a `TetrahedralAvxSseFetchVector` and forwarding to the type's `interpolate`.
macro_rules! define_interp_avx {
    ($kernel:ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15 for $kernel<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table: &[AvxAlignedI16],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<i16>],
            ) -> AvxVectorQ0_15Sse {
                let fetcher = TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table };
                // `interpolate` is an `unsafe fn` compiled with AVX2 enabled.
                unsafe { self.interpolate(in_r, in_g, in_b, lut, fetcher) }
            }
        }
    };
}
/// Implements `AvxMdInterpolationQ0_15Double` for a two-table kernel type by wrapping each
/// table in its own `TetrahedralAvxSseFetchVector` and forwarding to the type's
/// `interpolate`.
#[cfg(feature = "options")]
macro_rules! define_interp_avx_d {
    ($kernel:ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double for $kernel<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table0: &[AvxAlignedI16],
                table1: &[AvxAlignedI16],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<i16>],
            ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
                let first = TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table0 };
                let second = TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table1 };
                // `interpolate` is an `unsafe fn` compiled with AVX2 enabled.
                unsafe { self.interpolate(in_r, in_g, in_b, lut, first, second) }
            }
        }
    };
}
// Single-table kernels: each gets an `AvxMdInterpolationQ0_15` impl that hands its
// `interpolate` a `TetrahedralAvxSseFetchVector` over the table.
#[cfg(feature = "options")]
define_interp_avx!(TetrahedralAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx!(PyramidalAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx!(PrismaticAvxQ0_15);
define_interp_avx!(TrilinearAvxQ0_15);
// Two-table kernels whose `interpolate` takes one 128-bit fetcher per table.
#[cfg(feature = "options")]
define_interp_avx_d!(PrismaticAvxQ0_15Double);
#[cfg(feature = "options")]
define_interp_avx_d!(PyramidAvxFmaQ0_15Double);
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double
    for TetrahedralAvxQ0_15Double<GRID_SIZE>
{
    /// Interpolates both tables in one pass by fetching them as a single 256-bit vector
    /// (`table0` in the low half, `table1` in the high half).
    fn inter3_sse(
        &self,
        table0: &[AvxAlignedI16],
        table1: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let paired_tables = TetrahedralAvxFetchVector::<GRID_SIZE> {
            cube0: table0,
            cube1: table1,
        };
        // `interpolate` is an `unsafe fn` compiled with AVX2 enabled.
        unsafe { self.interpolate(in_r, in_g, in_b, lut, paired_tables) }
    }
}
impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double for TrilinearAvxQ0_15Double<GRID_SIZE> {
    /// Interpolates both tables in one pass by fetching them as a single 256-bit vector
    /// (`table0` in the low half, `table1` in the high half).
    fn inter3_sse(
        &self,
        table0: &[AvxAlignedI16],
        table1: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let paired_tables = TetrahedralAvxFetchVector::<GRID_SIZE> {
            cube0: table0,
            cube1: table1,
        };
        // `interpolate` is an `unsafe fn` compiled with AVX2 enabled.
        unsafe { self.interpolate(in_r, in_g, in_b, lut, paired_tables) }
    }
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalAvxQ0_15<GRID_SIZE> {
/// Pyramidal interpolation of one table.
///
/// `lut[in_r]`, `lut[in_g]` and `lut[in_b]` supply, per axis, the two grid coordinates
/// (`x`, `x_n`) and the weight `w`. One of three corner sets is chosen from the relative
/// size of the weights; the result is `c0 + c1 * db + c2 * dr + c3 * dg + c4 * w3`, where
/// `w3` is the product of two of the weights and depends on the branch.
///
/// # Safety
///
/// The CPU must support AVX2, and `in_r`, `in_g`, `in_b` must be valid indices into `lut`
/// (they are read with `get_unchecked`).
#[target_feature(enable = "avx2")]
unsafe fn interpolate(
&self,
in_r: usize,
in_g: usize,
in_b: usize,
lut: &[BarycentricWeight<i16>],
r: impl Fetcher<AvxVectorQ0_15Sse>,
) -> AvxVectorQ0_15Sse {
let lut_r = unsafe { *lut.get_unchecked(in_r) };
let lut_g = unsafe { *lut.get_unchecked(in_g) };
let lut_b = unsafe { *lut.get_unchecked(in_b) };
// First grid coordinate on each axis.
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
// Second grid coordinate on each axis.
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
// Per-axis weights.
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
// Base corner shared by all branches.
let c0 = r.fetch(x, y, z);
// Note the order: w0 is the blue weight, w1 red, w2 green.
let w0 = AvxVectorQ0_15Sse::from(db);
let w1 = AvxVectorQ0_15Sse::from(dr);
let w2 = AvxVectorQ0_15Sse::from(dg);
if dr > db && dg > db {
// Blue weight is strictly the smallest; the cross term uses dr * dg.
let w3 = AvxVectorQ0_15Sse::from(dr) * AvxVectorQ0_15Sse::from(dg);
let x0 = r.fetch(x_n, y_n, z_n);
let x1 = r.fetch(x_n, y_n, z);
let x2 = r.fetch(x_n, y, z);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3)
} else if db > dr && dg > dr {
// Red weight is strictly the smallest; the cross term uses dg * db.
let w3 = AvxVectorQ0_15Sse::from(dg) * AvxVectorQ0_15Sse::from(db);
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y_n, z_n);
let x2 = r.fetch(x, y_n, z_n);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3)
} else {
// Remaining cases (including ties); the cross term uses db * dr.
let w3 = AvxVectorQ0_15Sse::from(db) * AvxVectorQ0_15Sse::from(dr);
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z);
let x2 = r.fetch(x_n, y, z_n);
let x3 = r.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3)
}
}
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxQ0_15<GRID_SIZE> {
/// Prismatic interpolation of one table.
///
/// `lut[in_r]`, `lut[in_g]` and `lut[in_b]` supply, per axis, the two grid coordinates
/// (`x`, `x_n`) and the weight `w`. One of two corner sets is chosen by comparing the blue
/// and red weights; the result is
/// `c0 + c1 * db + c2 * dr + c3 * dg + c4 * (dg * db) + c5 * (dr * dg)`.
///
/// # Safety
///
/// The CPU must support AVX2, and `in_r`, `in_g`, `in_b` must be valid indices into `lut`
/// (they are read with `get_unchecked`).
#[target_feature(enable = "avx2")]
unsafe fn interpolate(
&self,
in_r: usize,
in_g: usize,
in_b: usize,
lut: &[BarycentricWeight<i16>],
r: impl Fetcher<AvxVectorQ0_15Sse>,
) -> AvxVectorQ0_15Sse {
let lut_r = unsafe { *lut.get_unchecked(in_r) };
let lut_g = unsafe { *lut.get_unchecked(in_g) };
let lut_b = unsafe { *lut.get_unchecked(in_b) };
// First grid coordinate on each axis.
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
// Second grid coordinate on each axis.
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
// Per-axis weights.
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
// Base corner shared by both branches.
let c0 = r.fetch(x, y, z);
// Note the order: w0 is the blue weight, w1 red, w2 green.
let w0 = AvxVectorQ0_15Sse::from(db);
let w1 = AvxVectorQ0_15Sse::from(dr);
let w2 = AvxVectorQ0_15Sse::from(dg);
// Cross-term weights, identical in both branches.
let w3 = AvxVectorQ0_15Sse::from(dg) * AvxVectorQ0_15Sse::from(db);
let w4 = AvxVectorQ0_15Sse::from(dr) * AvxVectorQ0_15Sse::from(dg);
if db > dr {
// Blue weight strictly larger than red.
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z_n);
let x2 = r.fetch(x, y_n, z);
let x3 = r.fetch(x, y_n, z_n);
let x4 = r.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
let s3 = s2.mla(c4, w3);
s3.mla(c5, w4)
} else {
// Red weight greater than or equal to blue.
let x0 = r.fetch(x_n, y, z);
let x1 = r.fetch(x_n, y, z_n);
let x2 = r.fetch(x, y_n, z);
let x3 = r.fetch(x_n, y_n, z);
let x4 = r.fetch(x_n, y_n, z_n);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
let s3 = s2.mla(c4, w3);
s3.mla(c5, w4)
}
}
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxQ0_15Double<GRID_SIZE> {
    /// Prismatic interpolation of two tables at once.
    ///
    /// `lut[in_r]`, `lut[in_g]` and `lut[in_b]` supply, per axis, the two grid coordinates
    /// (`x`, `x_n`) and the weight `w`. Every corner is fetched from both tables (`r0` into
    /// the low 128 bits, `r1` into the high 128 bits) so both results are computed by the
    /// same 256-bit arithmetic:
    /// `c0 + c1 * db + c2 * dr + c3 * dg + c4 * (dg * db) + c5 * (dr * dg)`.
    ///
    /// Returns `(result for r0, result for r1)`.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2, and `in_r`, `in_g`, `in_b` must be valid indices into `lut`
    /// (they are read with `get_unchecked`).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r0: impl Fetcher<AvxVectorQ0_15Sse>,
        r1: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // First grid coordinate on each axis.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        // Second grid coordinate on each axis.
        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Per-axis weights.
        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        // Base corner of each table. The second one must come from `r1`; reading it from
        // `r0` would make the second result start from the first table's base value.
        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        // Note the order: w0 is the blue weight, w1 red, w2 green.
        let w0 = AvxVectorQ0_15::from(db);
        let w1 = AvxVectorQ0_15::from(dr);
        let w2 = AvxVectorQ0_15::from(dg);
        // Cross-term weights, identical in both branches.
        let w3 = AvxVectorQ0_15::from(dg) * AvxVectorQ0_15::from(db);
        let w4 = AvxVectorQ0_15::from(dr) * AvxVectorQ0_15::from(dg);

        let c0 = AvxVectorQ0_15::from_sse(c0_0, c0_1);

        if db > dr {
            // Blue weight strictly larger than red.
            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x, y_n, z_n);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x, y_n, z_n);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);
            let x4 = AvxVectorQ0_15::from_sse(x4_0, x4_1);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        } else {
            // Red weight greater than or equal to blue.
            let x0_0 = r0.fetch(x_n, y, z);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x_n, y_n, z);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x_n, y, z);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x_n, y_n, z);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);
            let x4 = AvxVectorQ0_15::from_sse(x4_0, x4_1);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        }
    }
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidAvxFmaQ0_15Double<GRID_SIZE> {
/// Pyramidal interpolation of two tables at once.
///
/// `lut[in_r]`, `lut[in_g]` and `lut[in_b]` supply, per axis, the two grid coordinates
/// (`x`, `x_n`) and the weight `w`. Every corner is fetched from both tables (`r0` into the
/// low 128 bits, `r1` into the high 128 bits) so both results are computed by the same
/// 256-bit arithmetic: `c0 + c1 * db + c2 * dr + c3 * dg + c4 * w3`, where `w3` is the
/// product of two of the weights and depends on the branch.
///
/// Returns `(result for r0, result for r1)`.
///
/// # Safety
///
/// The CPU must support AVX2, and `in_r`, `in_g`, `in_b` must be valid indices into `lut`
/// (they are read with `get_unchecked`).
#[target_feature(enable = "avx2")]
unsafe fn interpolate(
&self,
in_r: usize,
in_g: usize,
in_b: usize,
lut: &[BarycentricWeight<i16>],
r0: impl Fetcher<AvxVectorQ0_15Sse>,
r1: impl Fetcher<AvxVectorQ0_15Sse>,
) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
let lut_r = unsafe { *lut.get_unchecked(in_r) };
let lut_g = unsafe { *lut.get_unchecked(in_g) };
let lut_b = unsafe { *lut.get_unchecked(in_b) };
// First grid coordinate on each axis.
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
// Second grid coordinate on each axis.
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
// Per-axis weights.
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
// Base corner of each table.
let c0_0 = r0.fetch(x, y, z);
let c0_1 = r1.fetch(x, y, z);
// Note the order: w0 is the blue weight, w1 red, w2 green.
let w0 = AvxVectorQ0_15::from(db);
let w1 = AvxVectorQ0_15::from(dr);
let w2 = AvxVectorQ0_15::from(dg);
// Low half carries table 0, high half carries table 1.
let c0 = AvxVectorQ0_15::from_sse(c0_0, c0_1);
if dr > db && dg > db {
// Blue weight is strictly the smallest; the cross term uses dr * dg.
let w3 = AvxVectorQ0_15::from(dr) * AvxVectorQ0_15::from(dg);
let x0_0 = r0.fetch(x_n, y_n, z_n);
let x1_0 = r0.fetch(x_n, y_n, z);
let x2_0 = r0.fetch(x_n, y, z);
let x3_0 = r0.fetch(x, y_n, z);
let x0_1 = r1.fetch(x_n, y_n, z_n);
let x1_1 = r1.fetch(x_n, y_n, z);
let x2_1 = r1.fetch(x_n, y, z);
let x3_1 = r1.fetch(x, y_n, z);
let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3).split()
} else if db > dr && dg > dr {
// Red weight is strictly the smallest; the cross term uses dg * db.
let w3 = AvxVectorQ0_15::from(dg) * AvxVectorQ0_15::from(db);
let x0_0 = r0.fetch(x, y, z_n);
let x1_0 = r0.fetch(x_n, y_n, z_n);
let x2_0 = r0.fetch(x, y_n, z_n);
let x3_0 = r0.fetch(x, y_n, z);
let x0_1 = r1.fetch(x, y, z_n);
let x1_1 = r1.fetch(x_n, y_n, z_n);
let x2_1 = r1.fetch(x, y_n, z_n);
let x3_1 = r1.fetch(x, y_n, z);
let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3).split()
} else {
// Remaining cases (including ties); the cross term uses db * dr.
let w3 = AvxVectorQ0_15::from(db) * AvxVectorQ0_15::from(dr);
let x0_0 = r0.fetch(x, y, z_n);
let x1_0 = r0.fetch(x_n, y, z);
let x2_0 = r0.fetch(x_n, y, z_n);
let x3_0 = r0.fetch(x_n, y_n, z_n);
let x0_1 = r1.fetch(x, y, z_n);
let x1_1 = r1.fetch(x_n, y, z);
let x2_1 = r1.fetch(x_n, y, z_n);
let x3_1 = r1.fetch(x_n, y_n, z_n);
let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3).split()
}
}
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxQ0_15Double<GRID_SIZE> {
/// Tetrahedral interpolation of two tables at once.
///
/// `rv` yields 256-bit vectors, so each fetch already carries the node of both tables
/// and a single pass of arithmetic produces both results. The ordering of the three
/// weights selects which corners are fetched; the result is
/// `c0 + c1 * rx + c2 * ry + c3 * rz`, returned as the `(low, high)` halves of the
/// 256-bit accumulator.
///
/// # Safety
///
/// The CPU must support AVX2, and `in_r`, `in_g`, `in_b` must be valid indices into `lut`
/// (they are read with `get_unchecked`).
#[target_feature(enable = "avx2")]
unsafe fn interpolate(
&self,
in_r: usize,
in_g: usize,
in_b: usize,
lut: &[BarycentricWeight<i16>],
rv: impl Fetcher<AvxVectorQ0_15>,
) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
let lut_r = unsafe { *lut.get_unchecked(in_r) };
let lut_g = unsafe { *lut.get_unchecked(in_g) };
let lut_b = unsafe { *lut.get_unchecked(in_b) };
// First grid coordinate on each axis.
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
// Second grid coordinate on each axis.
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
// Per-axis weights.
let rx = lut_r.w;
let ry = lut_g.w;
let rz = lut_b.w;
// Base corner; every branch builds its differences relative to it.
let c0 = rv.fetch(x, y, z);
let w0 = AvxVectorQ0_15::from(rx);
let w1 = AvxVectorQ0_15::from(ry);
let w2 = AvxVectorQ0_15::from(rz);
let c2;
let c1;
let c3;
if rx >= ry {
if ry >= rz {
// rx >= ry >= rz
c1 = rv.fetch(x_n, y, z) - c0;
c2 = rv.fetch(x_n, y_n, z) - rv.fetch(x_n, y, z);
c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
} else if rx >= rz {
// rx >= rz > ry
c1 = rv.fetch(x_n, y, z) - c0;
c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
c3 = rv.fetch(x_n, y, z_n) - rv.fetch(x_n, y, z);
} else {
// rz > rx >= ry
c1 = rv.fetch(x_n, y, z_n) - rv.fetch(x, y, z_n);
c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
c3 = rv.fetch(x, y, z_n) - c0;
}
} else if rx >= rz {
// ry > rx >= rz
c1 = rv.fetch(x_n, y_n, z) - rv.fetch(x, y_n, z);
c2 = rv.fetch(x, y_n, z) - c0;
c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
} else if ry >= rz {
// ry >= rz > rx
c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
c2 = rv.fetch(x, y_n, z) - c0;
c3 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y_n, z);
} else {
// rz > ry > rx
c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
c2 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y, z_n);
c3 = rv.fetch(x, y, z_n) - c0;
}
// Accumulate c0 + c1 * rx + c2 * ry + c3 * rz, then hand back the two halves.
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
s1.mla(c3, w2).split()
}
}
impl<const GRID_SIZE: usize> TrilinearAvxQ0_15Double<GRID_SIZE> {
    /// Trilinear interpolation of two tables at once.
    ///
    /// `rv` yields 256-bit vectors, so each fetch already carries the node of both tables.
    /// The eight corners of the cell are blended along the first axis, then the second, then
    /// the third, each step being `near * (MAX - w) + far * w` with `MAX = i16::MAX`.
    /// Returns the `(low, high)` halves of the 256-bit result.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2, and `in_r`, `in_g`, `in_b` must be valid indices into `lut`
    /// (they are read with `get_unchecked`).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        rv: impl Fetcher<AvxVectorQ0_15>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let (axis_r, axis_g, axis_b) = unsafe {
            (
                *lut.get_unchecked(in_r),
                *lut.get_unchecked(in_g),
                *lut.get_unchecked(in_b),
            )
        };
        let (x, y, z): (i32, i32, i32) = (axis_r.x, axis_g.x, axis_b.x);
        let (x_n, y_n, z_n): (i32, i32, i32) = (axis_r.x_n, axis_g.x_n, axis_b.x_n);

        // Largest positive 16-bit value; `full_scale - w` is the complementary weight.
        let full_scale = AvxVectorQ0_15::from(i16::MAX);
        let far_x = AvxVectorQ0_15::from(axis_r.w);
        let far_y = AvxVectorQ0_15::from(axis_g.w);
        let far_z = AvxVectorQ0_15::from(axis_b.w);
        let near_x = full_scale - far_x;
        let near_y = full_scale - far_y;
        let near_z = full_scale - far_z;

        // Corner naming: v<x><y><z>, where 1 means the `_n` coordinate on that axis.
        let v000 = rv.fetch(x, y, z);
        let v100 = rv.fetch(x_n, y, z);
        let v010 = rv.fetch(x, y_n, z);
        let v110 = rv.fetch(x_n, y_n, z);
        let v001 = rv.fetch(x, y, z_n);
        let v101 = rv.fetch(x_n, y, z_n);
        let v011 = rv.fetch(x, y_n, z_n);
        let v111 = rv.fetch(x_n, y_n, z_n);

        // Blend along the first axis: four edges.
        let edge_y0_z0 = (v000 * near_x).mla(v100, far_x);
        let edge_y1_z0 = (v010 * near_x).mla(v110, far_x);
        let edge_y0_z1 = (v001 * near_x).mla(v101, far_x);
        let edge_y1_z1 = (v011 * near_x).mla(v111, far_x);

        // Blend along the second axis: two faces.
        let face_z0 = (edge_y0_z0 * near_y).mla(edge_y1_z0, far_y);
        let face_z1 = (edge_y0_z1 * near_y).mla(edge_y1_z1, far_y);

        // Blend along the third axis and hand back the two halves.
        let blended = (face_z0 * near_z).mla(face_z1, far_z);
        blended.split()
    }
}
impl<const GRID_SIZE: usize> TrilinearAvxQ0_15<GRID_SIZE> {
    /// Trilinear interpolation of one table.
    ///
    /// Each corner at `z` is paired with its neighbour at `z_n` in one 256-bit vector
    /// (`z` in the low half, `z_n` in the high half). The first two axes are then blended
    /// for both planes at once, and the final blend along the third axis is done on the
    /// two 128-bit halves. Each blend is `near * (MAX - w) + far * w` with `MAX = i16::MAX`.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2, and `in_r`, `in_g`, `in_b` must be valid indices into `lut`
    /// (they are read with `get_unchecked`).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        let (axis_r, axis_g, axis_b) = unsafe {
            (
                *lut.get_unchecked(in_r),
                *lut.get_unchecked(in_g),
                *lut.get_unchecked(in_b),
            )
        };
        let (x, y, z): (i32, i32, i32) = (axis_r.x, axis_g.x, axis_b.x);
        let (x_n, y_n, z_n): (i32, i32, i32) = (axis_r.x_n, axis_g.x_n, axis_b.x_n);

        // The first two axes work on 256-bit pairs; the third works on 128-bit halves.
        let full_scale_wide = AvxVectorQ0_15::from(i16::MAX);
        let full_scale = AvxVectorQ0_15Sse::from(i16::MAX);
        let far_x = AvxVectorQ0_15::from(axis_r.w);
        let far_y = AvxVectorQ0_15::from(axis_g.w);
        let far_z = AvxVectorQ0_15Sse::from(axis_b.w);
        let near_x = full_scale_wide - far_x;
        let near_y = full_scale_wide - far_y;
        let near_z = full_scale - far_z;

        // Corner naming: v<x><y><z>, where 1 means the `_n` coordinate on that axis.
        let v000 = r.fetch(x, y, z);
        let v100 = r.fetch(x_n, y, z);
        let v010 = r.fetch(x, y_n, z);
        let v110 = r.fetch(x_n, y_n, z);
        let v001 = r.fetch(x, y, z_n);
        let v101 = r.fetch(x_n, y, z_n);
        let v011 = r.fetch(x, y_n, z_n);
        let v111 = r.fetch(x_n, y_n, z_n);

        // Pair each corner with its third-axis neighbour: low = z, high = z_n.
        let pair_x0_y0 = AvxVectorQ0_15::from_sse(v000, v001);
        let pair_x0_y1 = AvxVectorQ0_15::from_sse(v010, v011);
        let pair_x1_y0 = AvxVectorQ0_15::from_sse(v100, v101);
        let pair_x1_y1 = AvxVectorQ0_15::from_sse(v110, v111);

        // Blend along the first axis, then the second, for both planes at once.
        let edge_y0 = (pair_x0_y0 * near_x).mla(pair_x1_y0, far_x);
        let edge_y1 = (pair_x0_y1 * near_x).mla(pair_x1_y1, far_x);
        let faces = (edge_y0 * near_y).mla(edge_y1, far_y);

        // Final blend along the third axis between the two planes.
        let (face_z0, face_z1) = faces.split();
        (face_z0 * near_z).mla(face_z1, far_z)
    }
}