#![allow(unsafe_code)]
use crate::simd::scalar::ScalarFallback;
use crate::simd::traits::{SimdOps, SimdOpsExt};
use crate::simd::types::{I16x8, I32x4, U8x16};
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
use core::arch::wasm32::*;
/// Snapshot of the WebAssembly SIMD capabilities this build was compiled with.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct WasmSimdInfo {
    /// `true` when the crate was compiled for `wasm32` with the `simd128`
    /// target feature enabled.
    pub simd128_available: bool,
    /// Relaxed-SIMD flag; [`WasmSimdInfo::detect`] currently always reports
    /// `false` for it.
    pub relaxed_simd_available: bool,
}

impl WasmSimdInfo {
    /// Reports the SIMD capabilities selected at compile time.
    ///
    /// No runtime probing takes place: the answer is derived purely from the
    /// `target_arch` / `target_feature` configuration of the build.
    pub fn detect() -> Self {
        let simd128_available = cfg!(all(target_arch = "wasm32", target_feature = "simd128"));
        Self {
            simd128_available,
            relaxed_simd_available: false,
        }
    }
}
/// `SimdOps` backend for WebAssembly.
///
/// Bundles the compile-time capability report with a scalar implementation
/// that the trait methods delegate to when the `simd128` code paths are not
/// compiled in.
#[derive(Clone, Copy, Debug)]
pub struct WasmSimd {
// Capability snapshot captured when the value is constructed.
info: WasmSimdInfo,
// Scalar implementation used wherever no wasm SIMD path is available.
fallback: ScalarFallback,
}
impl WasmSimd {
    /// Creates a backend, recording the compile-time SIMD capabilities and
    /// preparing the scalar fallback.
    #[inline]
    #[must_use]
    pub fn new() -> Self {
        let info = WasmSimdInfo::detect();
        let fallback = ScalarFallback::new();
        Self { info, fallback }
    }

    /// Returns a copy of the capability report captured at construction.
    #[inline]
    pub fn info(&self) -> WasmSimdInfo {
        let Self { info, .. } = self;
        *info
    }
}
impl Default for WasmSimd {
fn default() -> Self {
Self::new()
}
}
/// Builds a `v128` from the eight lanes of an [`I16x8`].
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn i16x8_to_v128(v: I16x8) -> v128 {
    // SAFETY: this function is only compiled when `simd128` is statically
    // enabled for the target (see the `cfg` above).
    unsafe { i16x8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]) }
}

/// Reads a `v128` back as eight `i16` lanes.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn v128_to_i16x8(v: v128) -> I16x8 {
    // SAFETY: only compiled when `simd128` is statically enabled.
    unsafe {
        I16x8::from_array([
            i16x8_extract_lane::<0>(v),
            i16x8_extract_lane::<1>(v),
            i16x8_extract_lane::<2>(v),
            i16x8_extract_lane::<3>(v),
            i16x8_extract_lane::<4>(v),
            i16x8_extract_lane::<5>(v),
            i16x8_extract_lane::<6>(v),
            i16x8_extract_lane::<7>(v),
        ])
    }
}

/// Builds a `v128` from the four lanes of an [`I32x4`].
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn i32x4_to_v128(v: I32x4) -> v128 {
    // SAFETY: only compiled when `simd128` is statically enabled.
    unsafe { i32x4(v[0], v[1], v[2], v[3]) }
}

/// Reads a `v128` back as four `i32` lanes.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn v128_to_i32x4(v: v128) -> I32x4 {
    // SAFETY: only compiled when `simd128` is statically enabled.
    unsafe {
        I32x4::from_array([
            i32x4_extract_lane::<0>(v),
            i32x4_extract_lane::<1>(v),
            i32x4_extract_lane::<2>(v),
            i32x4_extract_lane::<3>(v),
        ])
    }
}

/// Builds a `v128` from the sixteen lanes of a [`U8x16`].
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn u8x16_to_v128(v: U8x16) -> v128 {
    // SAFETY: only compiled when `simd128` is statically enabled.
    unsafe {
        u8x16(
            v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9], v[10], v[11], v[12],
            v[13], v[14], v[15],
        )
    }
}

/// Reads a `v128` back as sixteen `u8` lanes.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn v128_to_u8x16(v: v128) -> U8x16 {
    // SAFETY: only compiled when `simd128` is statically enabled.
    unsafe {
        U8x16::from_array([
            u8x16_extract_lane::<0>(v),
            u8x16_extract_lane::<1>(v),
            u8x16_extract_lane::<2>(v),
            u8x16_extract_lane::<3>(v),
            u8x16_extract_lane::<4>(v),
            u8x16_extract_lane::<5>(v),
            u8x16_extract_lane::<6>(v),
            u8x16_extract_lane::<7>(v),
            u8x16_extract_lane::<8>(v),
            u8x16_extract_lane::<9>(v),
            u8x16_extract_lane::<10>(v),
            u8x16_extract_lane::<11>(v),
            u8x16_extract_lane::<12>(v),
            u8x16_extract_lane::<13>(v),
            u8x16_extract_lane::<14>(v),
            u8x16_extract_lane::<15>(v),
        ])
    }
}

/// Packs the first eight bytes of `s` into the low lanes of a `v128`; the
/// upper eight lanes are zero. Panics if `s` holds fewer than eight bytes.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn bytes8_to_v128(s: &[u8]) -> v128 {
    // SAFETY: only compiled when `simd128` is statically enabled.
    unsafe {
        u8x16(
            s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 0, 0, 0, 0, 0, 0, 0, 0,
        )
    }
}

/// Packs the first sixteen bytes of `s` into a `v128`. Panics if `s` holds
/// fewer than sixteen bytes.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn bytes16_to_v128(s: &[u8]) -> v128 {
    // SAFETY: only compiled when `simd128` is statically enabled.
    unsafe {
        u8x16(
            s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[8], s[9], s[10], s[11], s[12],
            s[13], s[14], s[15],
        )
    }
}

/// Lane-wise `|a - b|` for unsigned bytes, computed as `max(a, b) - min(a, b)`.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn v128_abs_diff_u8x16(a: v128, b: v128) -> v128 {
    // SAFETY: only compiled when `simd128` is statically enabled.
    unsafe { u8x16_sub(u8x16_max(a, b), u8x16_min(a, b)) }
}

/// Sign-extends the low and high halves of an `i16x8` to `i32x4` and adds
/// them, halving the lane count without losing precision.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn v128_widen_add_i16x8(v: v128) -> v128 {
    // SAFETY: only compiled when `simd128` is statically enabled.
    unsafe { i32x4_add(i32x4_extend_low_i16x8(v), i32x4_extend_high_i16x8(v)) }
}

/// Wrapping sum of the four `i32` lanes of `v`.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn v128_hsum_i32x4(v: v128) -> i32 {
    // SAFETY: only compiled when `simd128` is statically enabled.
    unsafe {
        i32x4_extract_lane::<0>(v)
            .wrapping_add(i32x4_extract_lane::<1>(v))
            .wrapping_add(i32x4_extract_lane::<2>(v))
            .wrapping_add(i32x4_extract_lane::<3>(v))
    }
}

/// Sum of absolute differences over all sixteen byte lanes of `a` and `b`.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn v128_sad_u8x16(a: v128, b: v128) -> u32 {
    let diff = v128_abs_diff_u8x16(a, b);
    // SAFETY: only compiled when `simd128` is statically enabled.
    unsafe {
        let lo = v128_widen_add_i16x8(i16x8_extend_low_u8x16(diff));
        let hi = v128_widen_add_i16x8(i16x8_extend_high_u8x16(diff));
        v128_hsum_i32x4(i32x4_add(lo, hi)) as u32
    }
}

/// `SimdOps` implemented with wasm `simd128` intrinsics.
///
/// Every method has two compile-time variants: when the crate is built for
/// `wasm32` with `simd128` enabled the intrinsic path is used, otherwise the
/// call is forwarded unchanged to the scalar fallback.
impl SimdOps for WasmSimd {
    /// Human-readable backend name reflecting whether `simd128` is in use.
    #[inline]
    fn name(&self) -> &'static str {
        if self.info.simd128_available {
            "wasm32-simd128"
        } else {
            "wasm32-scalar-fallback"
        }
    }

    /// `true` when the `simd128` code paths were compiled in.
    #[inline]
    fn is_available(&self) -> bool {
        self.info.simd128_available
    }

    /// Lane-wise addition of two `i16x8` vectors.
    #[inline]
    fn add_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i16x8(i16x8_add(i16x8_to_v128(a), i16x8_to_v128(b))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.add_i16x8(a, b)
        }
    }

    /// Lane-wise subtraction `a - b` of two `i16x8` vectors.
    #[inline]
    fn sub_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i16x8(i16x8_sub(i16x8_to_v128(a), i16x8_to_v128(b))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.sub_i16x8(a, b)
        }
    }

    /// Lane-wise multiplication of two `i16x8` vectors (low 16 bits kept).
    #[inline]
    fn mul_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i16x8(i16x8_mul(i16x8_to_v128(a), i16x8_to_v128(b))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.mul_i16x8(a, b)
        }
    }

    /// Lane-wise addition of two `i32x4` vectors.
    #[inline]
    fn add_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i32x4(i32x4_add(i32x4_to_v128(a), i32x4_to_v128(b))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.add_i32x4(a, b)
        }
    }

    /// Lane-wise subtraction `a - b` of two `i32x4` vectors.
    #[inline]
    fn sub_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i32x4(i32x4_sub(i32x4_to_v128(a), i32x4_to_v128(b))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.sub_i32x4(a, b)
        }
    }

    /// Lane-wise signed minimum of two `i16x8` vectors.
    #[inline]
    fn min_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i16x8(i16x8_min(i16x8_to_v128(a), i16x8_to_v128(b))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.min_i16x8(a, b)
        }
    }

    /// Lane-wise signed maximum of two `i16x8` vectors.
    #[inline]
    fn max_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i16x8(i16x8_max(i16x8_to_v128(a), i16x8_to_v128(b))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.max_i16x8(a, b)
        }
    }

    /// Clamps every `i16` lane of `v`; the SIMD path computes
    /// `min(max(v, min), max)` against splatted bounds.
    #[inline]
    fn clamp_i16x8(&self, v: I16x8, min: i16, max: i16) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe {
                let raised = i16x8_max(i16x8_to_v128(v), i16x8_splat(min));
                v128_to_i16x8(i16x8_min(raised, i16x8_splat(max)))
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.clamp_i16x8(v, min, max)
        }
    }

    /// Lane-wise unsigned minimum of two `u8x16` vectors.
    #[inline]
    fn min_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_u8x16(u8x16_min(u8x16_to_v128(a), u8x16_to_v128(b))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.min_u8x16(a, b)
        }
    }

    /// Lane-wise unsigned maximum of two `u8x16` vectors.
    #[inline]
    fn max_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_u8x16(u8x16_max(u8x16_to_v128(a), u8x16_to_v128(b))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.max_u8x16(a, b)
        }
    }

    /// Clamps every `u8` lane of `v`; the SIMD path computes
    /// `min(max(v, min), max)` against splatted bounds.
    #[inline]
    fn clamp_u8x16(&self, v: U8x16, min: u8, max: u8) -> U8x16 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe {
                let raised = u8x16_max(u8x16_to_v128(v), u8x16_splat(min));
                v128_to_u8x16(u8x16_min(raised, u8x16_splat(max)))
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.clamp_u8x16(v, min, max)
        }
    }

    /// Sum of all eight `i16` lanes, accumulated in `i32`.
    #[inline]
    fn horizontal_sum_i16x8(&self, v: I16x8) -> i32 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // Widen to i32 first so the pairwise add cannot overflow 16 bits.
            v128_hsum_i32x4(v128_widen_add_i16x8(i16x8_to_v128(v)))
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.horizontal_sum_i16x8(v)
        }
    }

    /// Wrapping sum of all four `i32` lanes.
    #[inline]
    fn horizontal_sum_i32x4(&self, v: I32x4) -> i32 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            v128_hsum_i32x4(i32x4_to_v128(v))
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.horizontal_sum_i32x4(v)
        }
    }

    /// Sum of absolute differences over sixteen byte lanes.
    #[inline]
    fn sad_u8x16(&self, a: U8x16, b: U8x16) -> u32 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            v128_sad_u8x16(u8x16_to_v128(a), u8x16_to_v128(b))
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.sad_u8x16(a, b)
        }
    }

    /// Sum of absolute differences over the first eight bytes of `a` and `b`.
    ///
    /// On the SIMD path, inputs shorter than eight bytes are handed to the
    /// scalar fallback instead of being indexed.
    #[inline]
    fn sad_8(&self, a: &[u8], b: &[u8]) -> u32 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            if a.len() < 8 || b.len() < 8 {
                return self.fallback.sad_8(a, b);
            }
            let diff = v128_abs_diff_u8x16(bytes8_to_v128(a), bytes8_to_v128(b));
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe {
                // Only the low eight lanes carry data; the padded upper lanes
                // are zero in both operands and are skipped.
                v128_hsum_i32x4(v128_widen_add_i16x8(i16x8_extend_low_u8x16(diff))) as u32
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.sad_8(a, b)
        }
    }

    /// Sum of absolute differences over the first sixteen bytes of `a` and `b`.
    ///
    /// On the SIMD path, inputs shorter than sixteen bytes are handed to the
    /// scalar fallback instead of being indexed.
    #[inline]
    fn sad_16(&self, a: &[u8], b: &[u8]) -> u32 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            if a.len() < 16 || b.len() < 16 {
                return self.fallback.sad_16(a, b);
            }
            v128_sad_u8x16(bytes16_to_v128(a), bytes16_to_v128(b))
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.sad_16(a, b)
        }
    }

    /// Zero-extends the low eight `u8` lanes of `v` to `i16`.
    #[inline]
    fn widen_low_u8_to_i16(&self, v: U8x16) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i16x8(i16x8_extend_low_u8x16(u8x16_to_v128(v))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.widen_low_u8_to_i16(v)
        }
    }

    /// Zero-extends the high eight `u8` lanes of `v` to `i16`.
    #[inline]
    fn widen_high_u8_to_i16(&self, v: U8x16) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i16x8(i16x8_extend_high_u8x16(u8x16_to_v128(v))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.widen_high_u8_to_i16(v)
        }
    }

    /// Narrows two `i32x4` vectors into one `i16x8` (`low` fills lanes 0-3,
    /// `high` lanes 4-7); the SIMD path uses the saturating narrow intrinsic.
    #[inline]
    fn narrow_i32x4_to_i16x8(&self, low: I32x4, high: I32x4) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i16x8(i16x8_narrow_i32x4(i32x4_to_v128(low), i32x4_to_v128(high))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.narrow_i32x4_to_i16x8(low, high)
        }
    }

    /// Lane-wise multiply-add `a * b + c` on `i16x8` vectors.
    #[inline]
    fn madd_i16x8(&self, a: I16x8, b: I16x8, c: I16x8) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe {
                let product = i16x8_mul(i16x8_to_v128(a), i16x8_to_v128(b));
                v128_to_i16x8(i16x8_add(product, i16x8_to_v128(c)))
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.madd_i16x8(a, b, c)
        }
    }

    /// Multiplies `i16` lanes pairwise and adds adjacent products into four
    /// `i32` lanes (the SIMD path uses the `i32x4_dot_i16x8` intrinsic).
    #[inline]
    fn pmaddwd(&self, a: I16x8, b: I16x8) -> I32x4 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i32x4(i32x4_dot_i16x8(i16x8_to_v128(a), i16x8_to_v128(b))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.pmaddwd(a, b)
        }
    }

    /// Right-shifts every `i16` lane; the SIMD path caps `shift` at 15.
    #[inline]
    fn shr_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i16x8(i16x8_shr(i16x8_to_v128(v), shift.min(15))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.shr_i16x8(v, shift)
        }
    }

    /// Left-shifts every `i16` lane; the SIMD path caps `shift` at 15.
    #[inline]
    fn shl_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i16x8(i16x8_shl(i16x8_to_v128(v), shift.min(15))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.shl_i16x8(v, shift)
        }
    }

    /// Right-shifts every `i32` lane; the SIMD path caps `shift` at 31.
    #[inline]
    fn shr_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i32x4(i32x4_shr(i32x4_to_v128(v), shift.min(31))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.shr_i32x4(v, shift)
        }
    }

    /// Left-shifts every `i32` lane; the SIMD path caps `shift` at 31.
    #[inline]
    fn shl_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_i32x4(i32x4_shl(i32x4_to_v128(v), shift.min(31))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.shl_i32x4(v, shift)
        }
    }

    /// Lane-wise average of two `u8x16` vectors (the SIMD path uses the
    /// rounding-average intrinsic `u8x16_avgr`).
    #[inline]
    fn avg_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: `simd128` is statically enabled by the surrounding `cfg`.
            unsafe { v128_to_u8x16(u8x16_avgr(u8x16_to_v128(a), u8x16_to_v128(b))) }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.avg_u8x16(a, b)
        }
    }
}
/// Extended operations for the wasm backend.
///
/// None of these have a dedicated `simd128` implementation here: every method
/// forwards its arguments unchanged to the scalar fallback, regardless of the
/// target configuration.
impl SimdOpsExt for WasmSimd {
/// Forwards to the scalar fallback's `load4_u8_to_i16x8`.
#[inline]
fn load4_u8_to_i16x8(&self, src: &[u8]) -> I16x8 {
self.fallback.load4_u8_to_i16x8(src)
}
/// Forwards to the scalar fallback's `load8_u8_to_i16x8`.
#[inline]
fn load8_u8_to_i16x8(&self, src: &[u8]) -> I16x8 {
self.fallback.load8_u8_to_i16x8(src)
}
/// Forwards to the scalar fallback's `store4_i16x8_as_u8`, which writes into `dst`.
#[inline]
fn store4_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]) {
self.fallback.store4_i16x8_as_u8(v, dst);
}
/// Forwards to the scalar fallback's `store8_i16x8_as_u8`, which writes into `dst`.
#[inline]
fn store8_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]) {
self.fallback.store8_i16x8_as_u8(v, dst);
}
/// Forwards to the scalar fallback's `transpose_4x4_i16`.
#[inline]
fn transpose_4x4_i16(&self, rows: &[I16x8; 4]) -> [I16x8; 4] {
self.fallback.transpose_4x4_i16(rows)
}
/// Forwards to the scalar fallback's `transpose_8x8_i16`.
#[inline]
fn transpose_8x8_i16(&self, rows: &[I16x8; 8]) -> [I16x8; 8] {
self.fallback.transpose_8x8_i16(rows)
}
/// Forwards to the scalar fallback's `butterfly_i16x8`.
// NOTE(review): presumably returns (a + b, a - b) — confirm against `ScalarFallback`.
#[inline]
fn butterfly_i16x8(&self, a: I16x8, b: I16x8) -> (I16x8, I16x8) {
self.fallback.butterfly_i16x8(a, b)
}
/// Forwards to the scalar fallback's `butterfly_i32x4`.
// NOTE(review): presumably returns (a + b, a - b) — confirm against `ScalarFallback`.
#[inline]
fn butterfly_i32x4(&self, a: I32x4, b: I32x4) -> (I32x4, I32x4) {
self.fallback.butterfly_i32x4(a, b)
}
}
/// Converts one 4x4 YUV 4:2:0 block to 16 interleaved RGBA pixels.
///
/// `y_block` holds the 16 luma samples in row-major order. `cb_block` and
/// `cr_block` each hold a 2x2 grid of chroma samples, one per 2x2 group of
/// pixels. The conversion uses fixed-point integer arithmetic (division
/// truncates toward zero), clamps each channel to `0..=255`, and sets alpha
/// to 255.
pub fn yuv420_4x4_to_rgba(
    y_block: &[u8; 16],
    cb_block: &[u8; 4],
    cr_block: &[u8; 4],
) -> [u8; 64] {
    let mut rgba = [0u8; 64];
    for (pixel_idx, (out, &luma)) in rgba.chunks_exact_mut(4).zip(y_block.iter()).enumerate() {
        // Row / 2 selects the chroma row, column / 2 the chroma column.
        let chroma_idx = (pixel_idx / 8) * 2 + (pixel_idx % 4) / 2;
        let luma = i32::from(luma);
        let blue_diff = i32::from(cb_block[chroma_idx]) - 128;
        let red_diff = i32::from(cr_block[chroma_idx]) - 128;

        let red = luma + (1_402 * red_diff) / 1_000;
        let green = luma - (344_136 * blue_diff + 714_136 * red_diff) / 1_000_000;
        let blue = luma + (1_772 * blue_diff) / 1_000;

        out.copy_from_slice(&[
            red.clamp(0, 255) as u8,
            green.clamp(0, 255) as u8,
            blue.clamp(0, 255) as u8,
            255,
        ]);
    }
    rgba
}
/// Converts 16 interleaved RGBA pixels (a 4x4 block, row-major) to YUV 4:2:0.
///
/// Returns `None` when `rgba` holds fewer than 64 bytes; any bytes beyond the
/// first 64 are ignored, as is the alpha channel. On success the tuple is
/// `(y_block, cb_block, cr_block)`: 16 luma samples plus a 2x2 grid each of
/// Cb and Cr, where every chroma sample is the average over its 2x2 pixel
/// group. All arithmetic is fixed-point integer math (division truncates
/// toward zero) and results are clamped to `0..=255`.
pub fn rgba_to_yuv420_4x4(rgba: &[u8]) -> Option<([u8; 16], [u8; 4], [u8; 4])> {
    // Each chroma quadrant of a 4x4 block always receives exactly 2x2 samples.
    const SAMPLES_PER_QUADRANT: i32 = 4;

    let pixels = rgba.get(..64)?;
    let mut y_block = [0u8; 16];
    let mut cb_sum = [0i32; 4];
    let mut cr_sum = [0i32; 4];

    for (pixel_idx, px) in pixels.chunks_exact(4).enumerate() {
        let r = i32::from(px[0]);
        let g = i32::from(px[1]);
        let b = i32::from(px[2]);

        let luma = (299 * r + 587 * g + 114 * b) / 1_000;
        y_block[pixel_idx] = luma.clamp(0, 255) as u8;

        // Row / 2 selects the chroma row, column / 2 the chroma column.
        let quadrant = (pixel_idx / 8) * 2 + (pixel_idx % 4) / 2;
        cb_sum[quadrant] += (-168_736 * r - 331_264 * g + 500_000 * b) / 1_000_000 + 128;
        cr_sum[quadrant] += (500_000 * r - 418_688 * g - 81_312 * b) / 1_000_000 + 128;
    }

    let average = |sum: i32| (sum / SAMPLES_PER_QUADRANT).clamp(0, 255) as u8;
    Some((y_block, cb_sum.map(average), cr_sum.map(average)))
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::simd::traits::SimdOps;
    use crate::simd::types::{I16x8, I32x4, U8x16};

    /// Backend under test, freshly constructed for each case.
    fn simd() -> WasmSimd {
        WasmSimd::new()
    }

    #[test]
    fn test_wasm_simd_name_is_non_empty() {
        assert!(!simd().name().is_empty());
    }

    /// `detect` must agree with the compile-time target configuration.
    #[test]
    fn test_wasm_simd_info_detect() {
        let compiled_with_simd = cfg!(all(target_arch = "wasm32", target_feature = "simd128"));
        assert_eq!(WasmSimdInfo::detect().simd128_available, compiled_with_simd);
    }

    #[test]
    fn test_add_i16x8() {
        let lhs = [1i16, 2, 3, 4, 5, 6, 7, 8];
        let rhs = [10i16, 20, 30, 40, 50, 60, 70, 80];
        let sum = simd().add_i16x8(I16x8::from_array(lhs), I16x8::from_array(rhs));
        for (lane, (&x, &y)) in lhs.iter().zip(rhs.iter()).enumerate() {
            assert_eq!(sum[lane], x + y);
        }
    }

    #[test]
    fn test_sub_i16x8() {
        let lhs = [100i16, 90, 80, 70, 60, 50, 40, 30];
        let rhs = [10i16, 20, 30, 40, 50, 40, 30, 20];
        let diff = simd().sub_i16x8(I16x8::from_array(lhs), I16x8::from_array(rhs));
        for (lane, (&x, &y)) in lhs.iter().zip(rhs.iter()).enumerate() {
            assert_eq!(diff[lane], x - y);
        }
    }

    #[test]
    fn test_mul_i16x8() {
        let lhs = [2i16, 3, 4, 5, 6, 7, 8, 9];
        let rhs = [3i16; 8];
        let product = simd().mul_i16x8(I16x8::from_array(lhs), I16x8::from_array(rhs));
        for (lane, (&x, &y)) in lhs.iter().zip(rhs.iter()).enumerate() {
            assert_eq!(product[lane], x.wrapping_mul(y));
        }
    }

    #[test]
    fn test_sad_u8x16_identical() {
        let block = U8x16::from_array([50u8; 16]);
        let same = U8x16::from_array([50u8; 16]);
        assert_eq!(simd().sad_u8x16(block, same), 0);
    }

    #[test]
    fn test_sad_u8x16_non_zero() {
        let low = U8x16::from_array([10u8; 16]);
        let high = U8x16::from_array([20u8; 16]);
        assert_eq!(simd().sad_u8x16(low, high), 16 * 10);
    }

    #[test]
    fn test_min_max_i16x8() {
        let backend = simd();
        let mixed = [1i16, -1, 3, -3, 5, -5, 7, -7];
        let zeros = [0i16; 8];
        let smallest = backend.min_i16x8(I16x8::from_array(mixed), I16x8::from_array(zeros));
        let largest = backend.max_i16x8(I16x8::from_array(mixed), I16x8::from_array(zeros));
        for (lane, (&x, &y)) in mixed.iter().zip(zeros.iter()).enumerate() {
            assert_eq!(smallest[lane], x.min(y));
            assert_eq!(largest[lane], x.max(y));
        }
    }

    #[test]
    fn test_horizontal_sum_i32x4() {
        let lanes = I32x4::from_array([1, 2, 3, 4]);
        assert_eq!(simd().horizontal_sum_i32x4(lanes), 10);
    }

    /// With an all-ones multiplier each output lane is the sum of one
    /// adjacent input pair.
    #[test]
    fn test_pmaddwd() {
        let values = I16x8::from_array([1, 2, 3, 4, 5, 6, 7, 8]);
        let ones = I16x8::from_array([1; 8]);
        let pair_sums = simd().pmaddwd(values, ones);
        let expected = [3i32, 7, 11, 15];
        for (lane, &want) in expected.iter().enumerate() {
            assert_eq!(pair_sums[lane], want);
        }
    }

    #[test]
    fn test_yuv420_4x4_to_rgba_round_trip_approx() {
        let luma = [235u8; 16];
        // Neutral chroma: the output should be a bright grey.
        let neutral = [128u8; 4];
        let rgba = yuv420_4x4_to_rgba(&luma, &neutral, &neutral);
        for (i, px) in rgba.chunks_exact(4).enumerate() {
            assert!(px[0] > 200, "R too low at pixel {i}");
            assert!(px[1] > 200, "G too low at pixel {i}");
            assert!(px[2] > 200, "B too low at pixel {i}");
            assert_eq!(px[3], 255, "alpha should be 255");
        }
    }

    #[test]
    fn test_rgba_to_yuv420_4x4_none_on_short_slice() {
        let too_short = [0u8; 63];
        assert!(rgba_to_yuv420_4x4(&too_short).is_none());
    }

    #[test]
    fn test_rgba_to_yuv420_4x4_neutral_grey() {
        let rgba = [128u8, 128, 128, 255].repeat(16);
        let (y_block, cb_block, cr_block) = rgba_to_yuv420_4x4(&rgba).unwrap();
        for &y in y_block.iter() {
            assert!((i32::from(y) - 128).abs() < 5, "Y should be ≈128, got {y}");
        }
        for &cb in cb_block.iter() {
            assert!((i32::from(cb) - 128).abs() < 5, "Cb should be ≈128, got {cb}");
        }
        for &cr in cr_block.iter() {
            assert!((i32::from(cr) - 128).abs() < 5, "Cr should be ≈128, got {cr}");
        }
    }
}