use std::ops::{
Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Div, DivAssign,
Mul, MulAssign, Neg, Sub, SubAssign,
};
use archmage::intrinsics::wasm32::*;
use crate::U32SimdVec;
use super::super::{F32SimdVec, I32SimdVec, SimdDescriptor, SimdMask, U8SimdVec, U16SimdVec};
/// Zero-sized descriptor for the 128-bit WASM SIMD backend.
/// Carrying this value is the proof that the backend may be used.
#[derive(Clone, Copy, Debug)]
pub struct Wasm128Descriptor(());
impl Wasm128Descriptor {
    /// Builds the descriptor from an `archmage` token.
    /// The token itself carries the capability proof, so it is consumed unused.
    #[inline]
    pub fn from_token(_token: archmage::Wasm128Token) -> Self {
        Self(())
    }
}
/// Eight `bf16` table entries packed into a single `v128`.
/// Built by `prepare_table_bf16_8` and consumed by `table_lookup_bf16_8`.
#[derive(Clone, Copy, Debug)]
#[repr(transparent)]
pub struct Bf16Table8Wasm128(v128);
impl SimdDescriptor for Wasm128Descriptor {
    // All vector types of this backend are 128-bit (`v128`) wrappers.
    type F32Vec = F32VecWasm128;
    type I32Vec = I32VecWasm128;
    type U32Vec = U32VecWasm128;
    type U16Vec = U16VecWasm128;
    type U8Vec = U8VecWasm128;
    type Mask = MaskWasm128;
    type Bf16Table8 = Bf16Table8Wasm128;
    // 128-bit is the only width available here, so the "narrower"
    // descriptors are this descriptor itself.
    type Descriptor256 = Self;
    type Descriptor128 = Self;
    /// Construction always succeeds for this backend.
    fn new() -> Option<Self> {
        Some(Self(()))
    }
    /// No-op: there is no wider vector width to downgrade from.
    #[inline]
    fn maybe_downgrade_256bit(self) -> Self {
        self
    }
    /// No-op: already 128-bit.
    #[inline]
    fn maybe_downgrade_128bit(self) -> Self {
        self
    }
    /// Runs `f` with this descriptor; no dispatch indirection is needed.
    fn call<R>(self, f: impl FnOnce(Self) -> R) -> R {
        f(self)
    }
}
// Helper macro for writing trait methods in this file.
// Each `fn name(this: Ty, args...) -> Ret { body }` item expands to an
// `#[inline(always)]` method taking `self`; `self` is rebound to the
// `this` identifier so the body can refer to the receiver by that name.
// The first rule accepts an empty invocation.
macro_rules! fn_wasm128 {
    {} => {};
    {$(
        fn $name:ident($this:ident: $self_ty:ty $(, $arg:ident: $ty:ty)* $(,)?) $(-> $ret:ty )?
        $body: block
    )*} => {$(
        #[inline(always)]
        fn $name(self: $self_ty, $($arg: $ty),*) $(-> $ret)? {
            let $this = self;
            $body
        }
    )*};
}
/// Four `f32` lanes in one `v128`, paired with the backend descriptor.
#[derive(Clone, Copy, Debug)]
pub struct F32VecWasm128(v128, Wasm128Descriptor);
impl F32SimdVec for F32VecWasm128 {
    type Descriptor = Wasm128Descriptor;
    // Four f32 lanes per 128-bit vector.
    const LEN: usize = 4;
    /// Broadcasts `v` into all four lanes.
    #[inline(always)]
    fn splat(d: Self::Descriptor, v: f32) -> Self {
        Self(f32x4_splat(v), d)
    }
    /// All-zero vector.
    #[inline(always)]
    fn zero(d: Self::Descriptor) -> Self {
        Self(f32x4_splat(0.0), d)
    }
    /// Loads the first four floats of `mem`; panics if `mem` is shorter.
    #[inline(always)]
    fn load(d: Self::Descriptor, mem: &[f32]) -> Self {
        assert!(mem.len() >= Self::LEN);
        Self(v128_load(mem.first_chunk::<4>().unwrap()), d)
    }
    /// Stores the four lanes into the first four floats of `mem`.
    #[inline(always)]
    fn store(&self, mem: &mut [f32]) {
        assert!(mem.len() >= Self::LEN);
        v128_store(mem.first_chunk_mut::<4>().unwrap(), self.0)
    }
    /// Writes `a0 b0 a1 b1 a2 b2 a3 b3` into `dest`.
    /// The byte shuffles move whole f32 lanes: every group of four byte
    /// indices is one lane (0..16 selects from `a`, 16..32 from `b`).
    #[inline(always)]
    fn store_interleaved_2(a: Self, b: Self, dest: &mut [f32]) {
        assert!(dest.len() >= 2 * Self::LEN);
        let lo = i8x16_shuffle::<0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23>(a.0, b.0);
        let hi =
            i8x16_shuffle::<8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31>(a.0, b.0);
        v128_store(dest[..4].first_chunk_mut::<4>().unwrap(), lo);
        v128_store(dest[4..].first_chunk_mut::<4>().unwrap(), hi);
    }
    /// Writes `a0 b0 c0 a1 | b1 c1 a2 b2 | c2 a3 b3 c3` into `dest`.
    /// Three-way interleave has no clean two-input shuffle form, so the
    /// output vectors are assembled lane-by-lane from extracted scalars.
    #[inline(always)]
    fn store_interleaved_3(a: Self, b: Self, c: Self, dest: &mut [f32]) {
        assert!(dest.len() >= 3 * Self::LEN);
        let out0 = f32x4(
            f32x4_extract_lane::<0>(a.0),
            f32x4_extract_lane::<0>(b.0),
            f32x4_extract_lane::<0>(c.0),
            f32x4_extract_lane::<1>(a.0),
        );
        let out1 = f32x4(
            f32x4_extract_lane::<1>(b.0),
            f32x4_extract_lane::<1>(c.0),
            f32x4_extract_lane::<2>(a.0),
            f32x4_extract_lane::<2>(b.0),
        );
        let out2 = f32x4(
            f32x4_extract_lane::<2>(c.0),
            f32x4_extract_lane::<3>(a.0),
            f32x4_extract_lane::<3>(b.0),
            f32x4_extract_lane::<3>(c.0),
        );
        v128_store(dest[..4].first_chunk_mut::<4>().unwrap(), out0);
        v128_store(dest[4..8].first_chunk_mut::<4>().unwrap(), out1);
        v128_store(dest[8..].first_chunk_mut::<4>().unwrap(), out2);
    }
    /// Writes `a0 b0 c0 d0 a1 b1 c1 d1 ...` into `dest` (planar -> AoS).
    /// Stage 1 interleaves pairs (a,b) and (c,d); stage 2 merges the
    /// 64-bit halves to form each output quartet.
    #[inline(always)]
    fn store_interleaved_4(a: Self, b: Self, c: Self, d: Self, dest: &mut [f32]) {
        assert!(dest.len() >= 4 * Self::LEN);
        let ab_lo =
            i8x16_shuffle::<0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23>(a.0, b.0);
        let ab_hi =
            i8x16_shuffle::<8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31>(a.0, b.0);
        let cd_lo =
            i8x16_shuffle::<0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23>(c.0, d.0);
        let cd_hi =
            i8x16_shuffle::<8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31>(c.0, d.0);
        let out0 =
            i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(ab_lo, cd_lo);
        let out1 = i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(
            ab_lo, cd_lo,
        );
        let out2 =
            i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(ab_hi, cd_hi);
        let out3 = i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(
            ab_hi, cd_hi,
        );
        v128_store(dest[..4].first_chunk_mut::<4>().unwrap(), out0);
        v128_store(dest[4..8].first_chunk_mut::<4>().unwrap(), out1);
        v128_store(dest[8..12].first_chunk_mut::<4>().unwrap(), out2);
        v128_store(dest[12..16].first_chunk_mut::<4>().unwrap(), out3);
    }
    /// Writes `a0 b0 c0 d0 e0 f0 g0 h0 a1 ...` into `dest`.
    /// Same two-stage interleave as `store_interleaved_4`, run once for
    /// (a,b,c,d) and once for (e,f,g,h); the two quartet streams are then
    /// stored alternately (abcd_i then efgh_i) to form each group of 8.
    #[inline(always)]
    fn store_interleaved_8(
        a: Self,
        b: Self,
        c: Self,
        d: Self,
        e: Self,
        f: Self,
        g: Self,
        h: Self,
        dest: &mut [f32],
    ) {
        assert!(dest.len() >= 8 * Self::LEN);
        let ab_lo =
            i8x16_shuffle::<0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23>(a.0, b.0);
        let ab_hi =
            i8x16_shuffle::<8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31>(a.0, b.0);
        let cd_lo =
            i8x16_shuffle::<0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23>(c.0, d.0);
        let cd_hi =
            i8x16_shuffle::<8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31>(c.0, d.0);
        let abcd_0 =
            i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(ab_lo, cd_lo);
        let abcd_1 = i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(
            ab_lo, cd_lo,
        );
        let abcd_2 =
            i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(ab_hi, cd_hi);
        let abcd_3 = i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(
            ab_hi, cd_hi,
        );
        let ef_lo =
            i8x16_shuffle::<0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23>(e.0, f.0);
        let ef_hi =
            i8x16_shuffle::<8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31>(e.0, f.0);
        let gh_lo =
            i8x16_shuffle::<0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23>(g.0, h.0);
        let gh_hi =
            i8x16_shuffle::<8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31>(g.0, h.0);
        let efgh_0 =
            i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(ef_lo, gh_lo);
        let efgh_1 = i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(
            ef_lo, gh_lo,
        );
        let efgh_2 =
            i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(ef_hi, gh_hi);
        let efgh_3 = i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(
            ef_hi, gh_hi,
        );
        v128_store(dest[..4].first_chunk_mut::<4>().unwrap(), abcd_0);
        v128_store(dest[4..8].first_chunk_mut::<4>().unwrap(), efgh_0);
        v128_store(dest[8..12].first_chunk_mut::<4>().unwrap(), abcd_1);
        v128_store(dest[12..16].first_chunk_mut::<4>().unwrap(), efgh_1);
        v128_store(dest[16..20].first_chunk_mut::<4>().unwrap(), abcd_2);
        v128_store(dest[20..24].first_chunk_mut::<4>().unwrap(), efgh_2);
        v128_store(dest[24..28].first_chunk_mut::<4>().unwrap(), abcd_3);
        v128_store(dest[28..].first_chunk_mut::<4>().unwrap(), efgh_3);
    }
    /// Inverse of `store_interleaved_2`: splits `a0 b0 a1 b1 a2 b2 a3 b3`
    /// back into the `a` and `b` planes (even lanes -> `a`, odd -> `b`).
    #[inline(always)]
    fn load_deinterleaved_2(d: Self::Descriptor, src: &[f32]) -> (Self, Self) {
        assert!(src.len() >= 2 * Self::LEN);
        let lo = v128_load(src[..4].first_chunk::<4>().unwrap());
        let hi = v128_load(src[4..].first_chunk::<4>().unwrap());
        let a = i8x16_shuffle::<0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27>(lo, hi);
        let b = i8x16_shuffle::<4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31>(lo, hi);
        (Self(a, d), Self(b, d))
    }
    /// Inverse of `store_interleaved_3`: gathers every third float
    /// (offsets 0, 1, 2) from `a0 b0 c0 a1 b1 c1 ...` via scalar extracts.
    #[inline(always)]
    fn load_deinterleaved_3(d: Self::Descriptor, src: &[f32]) -> (Self, Self, Self) {
        assert!(src.len() >= 3 * Self::LEN);
        let v0 = v128_load(src[..4].first_chunk::<4>().unwrap());
        let v1 = v128_load(src[4..8].first_chunk::<4>().unwrap());
        let v2 = v128_load(src[8..].first_chunk::<4>().unwrap());
        let a = f32x4(
            f32x4_extract_lane::<0>(v0),
            f32x4_extract_lane::<3>(v0),
            f32x4_extract_lane::<2>(v1),
            f32x4_extract_lane::<1>(v2),
        );
        let b = f32x4(
            f32x4_extract_lane::<1>(v0),
            f32x4_extract_lane::<0>(v1),
            f32x4_extract_lane::<3>(v1),
            f32x4_extract_lane::<2>(v2),
        );
        let c = f32x4(
            f32x4_extract_lane::<2>(v0),
            f32x4_extract_lane::<1>(v1),
            f32x4_extract_lane::<0>(v2),
            f32x4_extract_lane::<3>(v2),
        );
        (Self(a, d), Self(b, d), Self(c, d))
    }
    /// Inverse of `store_interleaved_4` (AoS -> planar), done as a 4x4
    /// transpose: gather 64-bit halves, then pick even/odd 32-bit lanes.
    #[inline(always)]
    fn load_deinterleaved_4(d: Self::Descriptor, src: &[f32]) -> (Self, Self, Self, Self) {
        assert!(src.len() >= 4 * Self::LEN);
        let v0 = v128_load(src[..4].first_chunk::<4>().unwrap());
        let v1 = v128_load(src[4..8].first_chunk::<4>().unwrap());
        let v2 = v128_load(src[8..12].first_chunk::<4>().unwrap());
        let v3 = v128_load(src[12..].first_chunk::<4>().unwrap());
        let ab_lo = i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(v0, v1);
        let cd_lo =
            i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(v0, v1);
        let ab_hi = i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(v2, v3);
        let cd_hi =
            i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(v2, v3);
        let a =
            i8x16_shuffle::<0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27>(ab_lo, ab_hi);
        let b = i8x16_shuffle::<4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31>(
            ab_lo, ab_hi,
        );
        let c =
            i8x16_shuffle::<0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27>(cd_lo, cd_hi);
        let e = i8x16_shuffle::<4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31>(
            cd_lo, cd_hi,
        );
        (Self(a, d), Self(b, d), Self(c, d), Self(e, d))
    }
    /// Transposes in place the 4x4 matrix whose rows live at
    /// `data[0]`, `data[stride]`, `data[2 * stride]`, `data[3 * stride]`.
    /// Classic two-stage shuffle transpose (interleave pairs, then merge
    /// 64-bit halves). The assert guarantees row index `3 * stride` is valid.
    #[inline(always)]
    fn transpose_square(_d: Wasm128Descriptor, data: &mut [[f32; 4]], stride: usize) {
        assert!(data.len() > 3 * stride);
        let p0 = v128_load(&data[0]);
        let p1 = v128_load(&data[stride]);
        let p2 = v128_load(&data[2 * stride]);
        let p3 = v128_load(&data[3 * stride]);
        let t0 = i8x16_shuffle::<0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 24, 25, 26, 27>(p0, p1);
        let t1 =
            i8x16_shuffle::<4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31>(p0, p1);
        let t2 = i8x16_shuffle::<0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 24, 25, 26, 27>(p2, p3);
        let t3 =
            i8x16_shuffle::<4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31>(p2, p3);
        let r0 = i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(t0, t2);
        let r1 = i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(t1, t3);
        let r2 =
            i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(t0, t2);
        let r3 =
            i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(t1, t3);
        v128_store(&mut data[0], r0);
        v128_store(&mut data[stride], r1);
        v128_store(&mut data[2 * stride], r2);
        v128_store(&mut data[3 * stride], r3);
    }
    // Shared array-backed helper methods defined elsewhere in the crate.
    crate::impl_f32_array_interface!();
    fn_wasm128! {
        // this * mul + add, computed as a separate multiply then add
        // (not a fused multiply-add).
        fn mul_add(this: F32VecWasm128, mul: F32VecWasm128, add: F32VecWasm128) -> F32VecWasm128 {
            F32VecWasm128(f32x4_add(f32x4_mul(this.0, mul.0), add.0), this.1)
        }
        // add - this * mul.
        fn neg_mul_add(this: F32VecWasm128, mul: F32VecWasm128, add: F32VecWasm128) -> F32VecWasm128 {
            F32VecWasm128(f32x4_sub(add.0, f32x4_mul(this.0, mul.0)), this.1)
        }
        fn abs(this: F32VecWasm128) -> F32VecWasm128 {
            F32VecWasm128(f32x4_abs(this.0), this.1)
        }
        fn floor(this: F32VecWasm128) -> F32VecWasm128 {
            F32VecWasm128(f32x4_floor(this.0), this.1)
        }
        fn sqrt(this: F32VecWasm128) -> F32VecWasm128 {
            F32VecWasm128(f32x4_sqrt(this.0), this.1)
        }
        fn neg(this: F32VecWasm128) -> F32VecWasm128 {
            F32VecWasm128(f32x4_neg(this.0), this.1)
        }
        // Magnitude of `this` combined with the sign bit of `sign`:
        // mask 0x7FFF_FFFF keeps exponent+mantissa, andnot isolates the
        // sign bit of `sign`, and the two are OR-ed together.
        fn copysign(this: F32VecWasm128, sign: F32VecWasm128) -> F32VecWasm128 {
            let sign_mask = i32x4_splat(0x7FFF_FFFFu32 as i32);
            let magnitude = v128_and(this.0, sign_mask);
            let sign_bit = v128_andnot(sign.0, sign_mask);
            F32VecWasm128(v128_or(magnitude, sign_bit), this.1)
        }
        fn max(this: F32VecWasm128, other: F32VecWasm128) -> F32VecWasm128 {
            F32VecWasm128(f32x4_max(this.0, other.0), this.1)
        }
        fn min(this: F32VecWasm128, other: F32VecWasm128) -> F32VecWasm128 {
            F32VecWasm128(f32x4_min(this.0, other.0), this.1)
        }
        fn gt(this: F32VecWasm128, other: F32VecWasm128) -> MaskWasm128 {
            MaskWasm128(f32x4_gt(this.0, other.0), this.1)
        }
        // Value conversion f32 -> i32 (saturating truncation).
        fn as_i32(this: F32VecWasm128) -> I32VecWasm128 {
            I32VecWasm128(i32x4_trunc_sat_f32x4(this.0), this.1)
        }
        // Bit reinterpretation; `v128` is typeless so no conversion is done.
        fn bitcast_to_i32(this: F32VecWasm128) -> I32VecWasm128 {
            I32VecWasm128(this.0, this.1)
        }
        // Round to nearest, then narrow i32 -> i16 -> u8 with saturation,
        // storing the four resulting bytes.
        fn round_store_u8(this: F32VecWasm128, dest: &mut [u8]) {
            assert!(dest.len() >= F32VecWasm128::LEN);
            let rounded = f32x4_nearest(this.0);
            let i32s = i32x4_trunc_sat_f32x4(rounded);
            let zeros = i32x4_splat(0);
            let i16s = i16x8_narrow_i32x4(i32s, zeros);
            let zeros_i16 = i16x8_splat(0);
            let u8s = u8x16_narrow_i16x8(i16s, zeros_i16);
            dest[0] = u8x16_extract_lane::<0>(u8s);
            dest[1] = u8x16_extract_lane::<1>(u8s);
            dest[2] = u8x16_extract_lane::<2>(u8s);
            dest[3] = u8x16_extract_lane::<3>(u8s);
        }
        // Round to nearest, narrow i32 -> u16 with saturation, store 4 lanes.
        fn round_store_u16(this: F32VecWasm128, dest: &mut [u16]) {
            assert!(dest.len() >= F32VecWasm128::LEN);
            let rounded = f32x4_nearest(this.0);
            let i32s = i32x4_trunc_sat_f32x4(rounded);
            let zeros = i32x4_splat(0);
            let u16s = u16x8_narrow_i32x4(i32s, zeros);
            dest[0] = u16x8_extract_lane::<0>(u16s);
            dest[1] = u16x8_extract_lane::<1>(u16s);
            dest[2] = u16x8_extract_lane::<2>(u16s);
            dest[3] = u16x8_extract_lane::<3>(u16s);
        }
        // f32 -> f16 conversion via the crate's scalar f16 helper;
        // done lane-by-lane through a stack buffer.
        fn store_f16_bits(this: F32VecWasm128, dest: &mut [u16]) {
            assert!(dest.len() >= F32VecWasm128::LEN);
            let mut arr = [0.0f32; 4];
            v128_store(&mut arr, this.0);
            dest[0] = crate::f16::from_f32(arr[0]).to_bits();
            dest[1] = crate::f16::from_f32(arr[1]).to_bits();
            dest[2] = crate::f16::from_f32(arr[2]).to_bits();
            dest[3] = crate::f16::from_f32(arr[3]).to_bits();
        }
    }
    /// Loads four f16 bit patterns and widens them to f32 via the crate's
    /// scalar f16 helper.
    #[inline(always)]
    fn load_f16_bits(d: Self::Descriptor, mem: &[u16]) -> Self {
        assert!(mem.len() >= Self::LEN);
        let v0 = crate::f16::from_bits(mem[0]).to_f32();
        let v1 = crate::f16::from_bits(mem[1]).to_f32();
        let v2 = crate::f16::from_bits(mem[2]).to_f32();
        let v3 = crate::f16::from_bits(mem[3]).to_f32();
        Self(f32x4(v0, v1, v2, v3), d)
    }
    /// Packs 8 f32 table entries into 8 bf16 values in one `v128`:
    /// shifting each f32 bit pattern right by 16 leaves its bf16 bits
    /// (truncation, no rounding), then the two i32x4 halves are narrowed
    /// to eight u16 lanes.
    #[inline(always)]
    fn prepare_table_bf16_8(_d: Wasm128Descriptor, table: &[f32; 8]) -> Bf16Table8Wasm128 {
        let table_lo = v128_load(table[..4].first_chunk::<4>().unwrap());
        let table_hi = v128_load(table[4..].first_chunk::<4>().unwrap());
        let bf16_lo_u32 = u32x4_shr(table_lo, 16);
        let bf16_hi_u32 = u32x4_shr(table_hi, 16);
        let bf16_table = u16x8_narrow_i32x4(bf16_lo_u32, bf16_hi_u32);
        Bf16Table8Wasm128(bf16_table)
    }
    /// Per-lane table lookup. For lane index i, the control word
    /// (i << 17) | (i << 25) | 0x0100_8080 has little-endian bytes
    /// [0x80, 0x80, 2*i, 2*i + 1]: the 0x80 bytes swizzle to zero, and the
    /// other two select the little-endian u16 of table entry i. The result
    /// lane is therefore the bf16 entry placed in the high half of an f32.
    /// Assumes each index is in 0..8 — out-of-range indices corrupt the
    /// control bytes.
    #[inline(always)]
    fn table_lookup_bf16_8(
        d: Wasm128Descriptor,
        table: Bf16Table8Wasm128,
        indices: I32VecWasm128,
    ) -> Self {
        let indices_u32 = indices.0;
        let shl17 = i32x4_shl(indices_u32, 17);
        let shl25 = i32x4_shl(indices_u32, 25);
        let base = i32x4_splat(0x01008080u32 as i32);
        let shuffle_mask = v128_or(v128_or(shl17, shl25), base);
        let result = i8x16_swizzle(table.0, shuffle_mask);
        F32VecWasm128(result, d)
    }
}
impl Add<F32VecWasm128> for F32VecWasm128 {
type Output = Self;
fn_wasm128! {
fn add(this: F32VecWasm128, rhs: F32VecWasm128) -> F32VecWasm128 {
F32VecWasm128(f32x4_add(this.0, rhs.0), this.1)
}
}
}
impl Sub<F32VecWasm128> for F32VecWasm128 {
type Output = Self;
fn_wasm128! {
fn sub(this: F32VecWasm128, rhs: F32VecWasm128) -> F32VecWasm128 {
F32VecWasm128(f32x4_sub(this.0, rhs.0), this.1)
}
}
}
impl Mul<F32VecWasm128> for F32VecWasm128 {
type Output = Self;
fn_wasm128! {
fn mul(this: F32VecWasm128, rhs: F32VecWasm128) -> F32VecWasm128 {
F32VecWasm128(f32x4_mul(this.0, rhs.0), this.1)
}
}
}
impl Div<F32VecWasm128> for F32VecWasm128 {
type Output = Self;
fn_wasm128! {
fn div(this: F32VecWasm128, rhs: F32VecWasm128) -> F32VecWasm128 {
F32VecWasm128(f32x4_div(this.0, rhs.0), this.1)
}
}
}
impl AddAssign<F32VecWasm128> for F32VecWasm128 {
fn_wasm128! {
fn add_assign(this: &mut F32VecWasm128, rhs: F32VecWasm128) {
this.0 = f32x4_add(this.0, rhs.0);
}
}
}
impl SubAssign<F32VecWasm128> for F32VecWasm128 {
fn_wasm128! {
fn sub_assign(this: &mut F32VecWasm128, rhs: F32VecWasm128) {
this.0 = f32x4_sub(this.0, rhs.0);
}
}
}
impl MulAssign<F32VecWasm128> for F32VecWasm128 {
fn_wasm128! {
fn mul_assign(this: &mut F32VecWasm128, rhs: F32VecWasm128) {
this.0 = f32x4_mul(this.0, rhs.0);
}
}
}
impl DivAssign<F32VecWasm128> for F32VecWasm128 {
fn_wasm128! {
fn div_assign(this: &mut F32VecWasm128, rhs: F32VecWasm128) {
this.0 = f32x4_div(this.0, rhs.0);
}
}
}
/// Four `i32` lanes in one `v128`, paired with the backend descriptor.
#[derive(Clone, Copy, Debug)]
pub struct I32VecWasm128(v128, Wasm128Descriptor);
impl I32SimdVec for I32VecWasm128 {
    type Descriptor = Wasm128Descriptor;
    // Four i32 lanes per 128-bit vector.
    const LEN: usize = 4;
    /// Broadcasts `v` into all four lanes.
    #[inline(always)]
    fn splat(d: Self::Descriptor, v: i32) -> Self {
        Self(i32x4_splat(v), d)
    }
    /// Loads the first four ints of `mem`; panics if `mem` is shorter.
    #[inline(always)]
    fn load(d: Self::Descriptor, mem: &[i32]) -> Self {
        assert!(mem.len() >= Self::LEN);
        Self(v128_load(mem.first_chunk::<4>().unwrap()), d)
    }
    /// Stores the four lanes into the first four ints of `mem`.
    #[inline(always)]
    fn store(&self, mem: &mut [i32]) {
        assert!(mem.len() >= Self::LEN);
        v128_store(mem.first_chunk_mut::<4>().unwrap(), self.0)
    }
    fn_wasm128! {
        fn abs(this: I32VecWasm128) -> I32VecWasm128 {
            I32VecWasm128(i32x4_abs(this.0), this.1)
        }
        // Value conversion i32 -> f32.
        fn as_f32(this: I32VecWasm128) -> F32VecWasm128 {
            F32VecWasm128(f32x4_convert_i32x4(this.0), this.1)
        }
        // Bit reinterpretations; `v128` is typeless so these are free.
        fn bitcast_to_f32(this: I32VecWasm128) -> F32VecWasm128 {
            F32VecWasm128(this.0, this.1)
        }
        fn bitcast_to_u32(this: I32VecWasm128) -> U32VecWasm128 {
            U32VecWasm128(this.0, this.1)
        }
        fn gt(this: I32VecWasm128, other: I32VecWasm128) -> MaskWasm128 {
            MaskWasm128(i32x4_gt(this.0, other.0), this.1)
        }
        fn lt_zero(this: I32VecWasm128) -> MaskWasm128 {
            MaskWasm128(i32x4_lt(this.0, i32x4_splat(0)), this.1)
        }
        fn eq(this: I32VecWasm128, other: I32VecWasm128) -> MaskWasm128 {
            MaskWasm128(i32x4_eq(this.0, other.0), this.1)
        }
        fn eq_zero(this: I32VecWasm128) -> MaskWasm128 {
            MaskWasm128(i32x4_eq(this.0, i32x4_splat(0)), this.1)
        }
        // High 32 bits of the 64-bit products this * rhs:
        // widening multiplies give two i64x2 vectors, an arithmetic shift
        // moves each high word into the low half, and the byte shuffle
        // (low 4 bytes of each i64 lane, little-endian) packs the four
        // high words back into one i32x4.
        fn mul_wide_take_high(this: I32VecWasm128, rhs: I32VecWasm128) -> I32VecWasm128 {
            let lo = i64x2_extmul_low_i32x4(this.0, rhs.0);
            let hi = i64x2_extmul_high_i32x4(this.0, rhs.0);
            let lo_high = i64x2_shr(lo, 32);
            let hi_high = i64x2_shr(hi, 32);
            let result = i8x16_shuffle::<
                0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27,
            >(lo_high, hi_high);
            I32VecWasm128(result, this.1)
        }
    }
    /// Left shift by `AMOUNT_U` bits.
    /// `AMOUNT_I` is unused by this backend — presumably for backends whose
    /// intrinsics take a signed count; TODO confirm against the trait.
    #[inline(always)]
    fn shl<const AMOUNT_U: u32, const AMOUNT_I: i32>(self) -> Self {
        Self(i32x4_shl(self.0, AMOUNT_U), self.1)
    }
    /// Arithmetic right shift by `AMOUNT_U` bits (`AMOUNT_I` unused here).
    #[inline(always)]
    fn shr<const AMOUNT_U: u32, const AMOUNT_I: i32>(self) -> Self {
        Self(i32x4_shr(self.0, AMOUNT_U), self.1)
    }
    /// Stores the low 16 bits of each lane (truncating `as` cast, no
    /// saturation).
    #[inline(always)]
    fn store_u16(self, dest: &mut [u16]) {
        assert!(dest.len() >= Self::LEN);
        let lane0 = i32x4_extract_lane::<0>(self.0) as u16;
        let lane1 = i32x4_extract_lane::<1>(self.0) as u16;
        let lane2 = i32x4_extract_lane::<2>(self.0) as u16;
        let lane3 = i32x4_extract_lane::<3>(self.0) as u16;
        dest[0] = lane0;
        dest[1] = lane1;
        dest[2] = lane2;
        dest[3] = lane3;
    }
    /// Stores the low 8 bits of each lane (truncating `as` cast, no
    /// saturation).
    #[inline(always)]
    fn store_u8(self, dest: &mut [u8]) {
        assert!(dest.len() >= Self::LEN);
        let lane0 = i32x4_extract_lane::<0>(self.0) as u8;
        let lane1 = i32x4_extract_lane::<1>(self.0) as u8;
        let lane2 = i32x4_extract_lane::<2>(self.0) as u8;
        let lane3 = i32x4_extract_lane::<3>(self.0) as u8;
        dest[0] = lane0;
        dest[1] = lane1;
        dest[2] = lane2;
        dest[3] = lane3;
    }
}
impl Add<I32VecWasm128> for I32VecWasm128 {
type Output = I32VecWasm128;
fn_wasm128! {
fn add(this: I32VecWasm128, rhs: I32VecWasm128) -> I32VecWasm128 {
I32VecWasm128(i32x4_add(this.0, rhs.0), this.1)
}
}
}
impl Sub<I32VecWasm128> for I32VecWasm128 {
type Output = I32VecWasm128;
fn_wasm128! {
fn sub(this: I32VecWasm128, rhs: I32VecWasm128) -> I32VecWasm128 {
I32VecWasm128(i32x4_sub(this.0, rhs.0), this.1)
}
}
}
impl Mul<I32VecWasm128> for I32VecWasm128 {
type Output = I32VecWasm128;
fn_wasm128! {
fn mul(this: I32VecWasm128, rhs: I32VecWasm128) -> I32VecWasm128 {
I32VecWasm128(i32x4_mul(this.0, rhs.0), this.1)
}
}
}
impl Neg for I32VecWasm128 {
type Output = I32VecWasm128;
fn_wasm128! {
fn neg(this: I32VecWasm128) -> I32VecWasm128 {
I32VecWasm128(i32x4_neg(this.0), this.1)
}
}
}
impl BitAnd<I32VecWasm128> for I32VecWasm128 {
type Output = I32VecWasm128;
fn_wasm128! {
fn bitand(this: I32VecWasm128, rhs: I32VecWasm128) -> I32VecWasm128 {
I32VecWasm128(v128_and(this.0, rhs.0), this.1)
}
}
}
impl BitOr<I32VecWasm128> for I32VecWasm128 {
type Output = I32VecWasm128;
fn_wasm128! {
fn bitor(this: I32VecWasm128, rhs: I32VecWasm128) -> I32VecWasm128 {
I32VecWasm128(v128_or(this.0, rhs.0), this.1)
}
}
}
impl BitXor<I32VecWasm128> for I32VecWasm128 {
type Output = I32VecWasm128;
fn_wasm128! {
fn bitxor(this: I32VecWasm128, rhs: I32VecWasm128) -> I32VecWasm128 {
I32VecWasm128(v128_xor(this.0, rhs.0), this.1)
}
}
}
impl AddAssign<I32VecWasm128> for I32VecWasm128 {
fn_wasm128! {
fn add_assign(this: &mut I32VecWasm128, rhs: I32VecWasm128) {
this.0 = i32x4_add(this.0, rhs.0);
}
}
}
impl SubAssign<I32VecWasm128> for I32VecWasm128 {
fn_wasm128! {
fn sub_assign(this: &mut I32VecWasm128, rhs: I32VecWasm128) {
this.0 = i32x4_sub(this.0, rhs.0);
}
}
}
impl MulAssign<I32VecWasm128> for I32VecWasm128 {
fn_wasm128! {
fn mul_assign(this: &mut I32VecWasm128, rhs: I32VecWasm128) {
this.0 = i32x4_mul(this.0, rhs.0);
}
}
}
impl BitAndAssign<I32VecWasm128> for I32VecWasm128 {
fn_wasm128! {
fn bitand_assign(this: &mut I32VecWasm128, rhs: I32VecWasm128) {
this.0 = v128_and(this.0, rhs.0);
}
}
}
impl BitOrAssign<I32VecWasm128> for I32VecWasm128 {
fn_wasm128! {
fn bitor_assign(this: &mut I32VecWasm128, rhs: I32VecWasm128) {
this.0 = v128_or(this.0, rhs.0);
}
}
}
impl BitXorAssign<I32VecWasm128> for I32VecWasm128 {
fn_wasm128! {
fn bitxor_assign(this: &mut I32VecWasm128, rhs: I32VecWasm128) {
this.0 = v128_xor(this.0, rhs.0);
}
}
}
/// Four `u32` lanes in one `v128`, paired with the backend descriptor.
#[derive(Clone, Copy, Debug)]
pub struct U32VecWasm128(v128, Wasm128Descriptor);
impl U32SimdVec for U32VecWasm128 {
type Descriptor = Wasm128Descriptor;
const LEN: usize = 4;
fn_wasm128! {
fn bitcast_to_i32(this: U32VecWasm128) -> I32VecWasm128 {
I32VecWasm128(this.0, this.1)
}
}
#[inline(always)]
fn shr<const AMOUNT_U: u32, const AMOUNT_I: i32>(self) -> Self {
Self(u32x4_shr(self.0, AMOUNT_U), self.1)
}
}
/// Sixteen `u8` lanes in one `v128`, paired with the backend descriptor.
#[derive(Clone, Copy, Debug)]
pub struct U8VecWasm128(v128, Wasm128Descriptor);
impl U8SimdVec for U8VecWasm128 {
    type Descriptor = Wasm128Descriptor;
    // Sixteen u8 lanes per 128-bit vector.
    const LEN: usize = 16;
    /// Loads the first 16 bytes of `mem`; panics if `mem` is shorter.
    #[inline(always)]
    fn load(d: Self::Descriptor, mem: &[u8]) -> Self {
        assert!(mem.len() >= Self::LEN);
        Self(v128_load(mem.first_chunk::<16>().unwrap()), d)
    }
    /// Broadcasts `v` into all sixteen lanes.
    #[inline(always)]
    fn splat(d: Self::Descriptor, v: u8) -> Self {
        Self(u8x16_splat(v), d)
    }
    /// Stores the sixteen lanes into the first 16 bytes of `mem`.
    #[inline(always)]
    fn store(&self, mem: &mut [u8]) {
        assert!(mem.len() >= Self::LEN);
        v128_store(mem.first_chunk_mut::<16>().unwrap(), self.0)
    }
    /// Writes `a0 b0 a1 b1 ... a15 b15` into `dest` via two byte
    /// interleaves (low halves, then high halves).
    #[inline(always)]
    fn store_interleaved_2(a: Self, b: Self, dest: &mut [u8]) {
        assert!(dest.len() >= 2 * Self::LEN);
        let lo = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(a.0, b.0);
        let hi =
            i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(a.0, b.0);
        v128_store(dest[..16].first_chunk_mut::<16>().unwrap(), lo);
        v128_store(dest[16..].first_chunk_mut::<16>().unwrap(), hi);
    }
    /// Writes `a0 b0 c0 a1 b1 c1 ...` (48 bytes) into `dest`.
    /// Three-way byte interleave is done scalar through stack buffers
    /// rather than with shuffles.
    #[inline(always)]
    fn store_interleaved_3(a: Self, b: Self, c: Self, dest: &mut [u8]) {
        assert!(dest.len() >= 3 * Self::LEN);
        let mut a_arr = [0u8; 16];
        let mut b_arr = [0u8; 16];
        let mut c_arr = [0u8; 16];
        v128_store(&mut a_arr, a.0);
        v128_store(&mut b_arr, b.0);
        v128_store(&mut c_arr, c.0);
        let mut out = [0u8; 48];
        for i in 0..16 {
            out[3 * i] = a_arr[i];
            out[3 * i + 1] = b_arr[i];
            out[3 * i + 2] = c_arr[i];
        }
        dest[..48].copy_from_slice(&out);
    }
    /// Writes `a0 b0 c0 d0 a1 b1 c1 d1 ...` (64 bytes) into `dest`.
    /// Stage 1 interleaves bytes of (a,b) and (c,d); stage 2 interleaves
    /// the resulting 16-bit pairs to form each abcd quartet.
    #[inline(always)]
    fn store_interleaved_4(a: Self, b: Self, c: Self, d: Self, dest: &mut [u8]) {
        assert!(dest.len() >= 4 * Self::LEN);
        let ab_lo =
            i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(a.0, b.0);
        let ab_hi =
            i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(a.0, b.0);
        let cd_lo =
            i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(c.0, d.0);
        let cd_hi =
            i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(c.0, d.0);
        let out0 =
            i8x16_shuffle::<0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23>(ab_lo, cd_lo);
        let out1 = i8x16_shuffle::<8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31>(
            ab_lo, cd_lo,
        );
        let out2 =
            i8x16_shuffle::<0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23>(ab_hi, cd_hi);
        let out3 = i8x16_shuffle::<8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31>(
            ab_hi, cd_hi,
        );
        v128_store(dest[..16].first_chunk_mut::<16>().unwrap(), out0);
        v128_store(dest[16..32].first_chunk_mut::<16>().unwrap(), out1);
        v128_store(dest[32..48].first_chunk_mut::<16>().unwrap(), out2);
        v128_store(dest[48..].first_chunk_mut::<16>().unwrap(), out3);
    }
}
/// Eight `u16` lanes in one `v128`, paired with the backend descriptor.
#[derive(Clone, Copy, Debug)]
pub struct U16VecWasm128(v128, Wasm128Descriptor);
impl U16SimdVec for U16VecWasm128 {
    type Descriptor = Wasm128Descriptor;
    // Eight u16 lanes per 128-bit vector.
    const LEN: usize = 8;
    /// Loads the first eight `u16`s of `mem`; panics if `mem` is shorter.
    #[inline(always)]
    fn load(d: Self::Descriptor, mem: &[u16]) -> Self {
        assert!(mem.len() >= Self::LEN);
        Self(v128_load(mem.first_chunk::<8>().unwrap()), d)
    }
    /// Broadcasts `v` into all eight lanes.
    #[inline(always)]
    fn splat(d: Self::Descriptor, v: u16) -> Self {
        Self(u16x8_splat(v), d)
    }
    /// Stores the eight lanes into the first eight `u16`s of `mem`.
    #[inline(always)]
    fn store(&self, mem: &mut [u16]) {
        assert!(mem.len() >= Self::LEN);
        v128_store(mem.first_chunk_mut::<8>().unwrap(), self.0)
    }
    /// Writes `a0 b0 a1 b1 ... a7 b7` into `dest`; each byte-index pair in
    /// the shuffles moves one u16 lane.
    #[inline(always)]
    fn store_interleaved_2(a: Self, b: Self, dest: &mut [u16]) {
        assert!(dest.len() >= 2 * Self::LEN);
        let lo = i8x16_shuffle::<0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23>(a.0, b.0);
        let hi =
            i8x16_shuffle::<8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31>(a.0, b.0);
        v128_store(dest[..8].first_chunk_mut::<8>().unwrap(), lo);
        v128_store(dest[8..].first_chunk_mut::<8>().unwrap(), hi);
    }
    /// Writes `a0 b0 c0 a1 b1 c1 ...` (24 values) into `dest`; the
    /// three-way interleave is done scalar through stack buffers.
    #[inline(always)]
    fn store_interleaved_3(a: Self, b: Self, c: Self, dest: &mut [u16]) {
        assert!(dest.len() >= 3 * Self::LEN);
        let mut a_arr = [0u16; 8];
        let mut b_arr = [0u16; 8];
        let mut c_arr = [0u16; 8];
        v128_store(&mut a_arr, a.0);
        v128_store(&mut b_arr, b.0);
        v128_store(&mut c_arr, c.0);
        let mut out = [0u16; 24];
        for i in 0..8 {
            out[3 * i] = a_arr[i];
            out[3 * i + 1] = b_arr[i];
            out[3 * i + 2] = c_arr[i];
        }
        dest[..24].copy_from_slice(&out);
    }
    /// Writes `a0 b0 c0 d0 a1 b1 c1 d1 ...` (32 values) into `dest`.
    /// Stage 1 interleaves u16 lanes of (a,b) and (c,d); stage 2
    /// interleaves the resulting 32-bit pairs into abcd quartets.
    #[inline(always)]
    fn store_interleaved_4(a: Self, b: Self, c: Self, d: Self, dest: &mut [u16]) {
        assert!(dest.len() >= 4 * Self::LEN);
        let ab_lo =
            i8x16_shuffle::<0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23>(a.0, b.0);
        let ab_hi =
            i8x16_shuffle::<8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31>(a.0, b.0);
        let cd_lo =
            i8x16_shuffle::<0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23>(c.0, d.0);
        let cd_hi =
            i8x16_shuffle::<8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31>(c.0, d.0);
        let out0 =
            i8x16_shuffle::<0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23>(ab_lo, cd_lo);
        let out1 = i8x16_shuffle::<8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31>(
            ab_lo, cd_lo,
        );
        let out2 =
            i8x16_shuffle::<0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23>(ab_hi, cd_hi);
        let out3 = i8x16_shuffle::<8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31>(
            ab_hi, cd_hi,
        );
        v128_store(dest[..8].first_chunk_mut::<8>().unwrap(), out0);
        v128_store(dest[8..16].first_chunk_mut::<8>().unwrap(), out1);
        v128_store(dest[16..24].first_chunk_mut::<8>().unwrap(), out2);
        v128_store(dest[24..].first_chunk_mut::<8>().unwrap(), out3);
    }
}
/// Comparison-result mask over 128 bits (produced by the lane comparisons
/// above and consumed via `v128_bitselect`), paired with its descriptor.
#[derive(Clone, Copy, Debug)]
pub struct MaskWasm128(v128, Wasm128Descriptor);
impl SimdMask for MaskWasm128 {
type Descriptor = Wasm128Descriptor;
fn_wasm128! {
fn if_then_else_f32(
this: MaskWasm128,
if_true: F32VecWasm128,
if_false: F32VecWasm128,
) -> F32VecWasm128 {
F32VecWasm128(v128_bitselect(if_true.0, if_false.0, this.0), this.1)
}
fn if_then_else_i32(
this: MaskWasm128,
if_true: I32VecWasm128,
if_false: I32VecWasm128,
) -> I32VecWasm128 {
I32VecWasm128(v128_bitselect(if_true.0, if_false.0, this.0), this.1)
}
fn maskz_i32(this: MaskWasm128, v: I32VecWasm128) -> I32VecWasm128 {
I32VecWasm128(v128_andnot(v.0, this.0), this.1)
}
fn andnot(this: MaskWasm128, rhs: MaskWasm128) -> MaskWasm128 {
MaskWasm128(v128_andnot(rhs.0, this.0), this.1)
}
fn all(this: MaskWasm128) -> bool {
i32x4_all_true(this.0)
}
}
}
impl BitAnd<MaskWasm128> for MaskWasm128 {
type Output = MaskWasm128;
fn_wasm128! {
fn bitand(this: MaskWasm128, rhs: MaskWasm128) -> MaskWasm128 {
MaskWasm128(v128_and(this.0, rhs.0), this.1)
}
}
}
impl BitOr<MaskWasm128> for MaskWasm128 {
type Output = MaskWasm128;
fn_wasm128! {
fn bitor(this: MaskWasm128, rhs: MaskWasm128) -> MaskWasm128 {
MaskWasm128(v128_or(this.0, rhs.0), this.1)
}
}
}