#![cfg(target_arch = "wasm32")]
#![allow(clippy::cast_possible_truncation)]
use core::arch::wasm32::{
i8x16_shuffle, i64x2_add, i64x2_mul, i64x2_shuffle, u64x2_shl, u64x2_shr, u64x2_splat, v128, v128_and, v128_load,
v128_or, v128_store, v128_xor,
};
use super::BLOCK_WORDS;
/// Compresses blocks `x` and `y` into `dst` using wasm SIMD128 lanes.
///
/// Computes `R = X ^ Y` (kept in both `r` and `q`), applies [`p_round`] to the
/// eight 16-word rows of `q`, then to the eight column groups taken with a
/// stride of 16 words, and finally stores `Q ^ R` into `dst`
/// (XOR-accumulated into the existing `dst` contents when `xor_into` is
/// true). The row pass, column pass, and BlaMka mixing in [`gb`] follow the
/// Argon2-style G compression layout.
///
/// NOTE(review): the hard-coded offsets below reach word index 126, so this
/// assumes `BLOCK_WORDS == 128` (a 1 KiB block) — confirm the constant in
/// the parent module.
///
/// # Safety
/// The caller must guarantee the `simd128` target feature is available; all
/// intrinsics used here require it.
#[target_feature(enable = "simd128")]
pub(super) unsafe fn compress_simd128(
    dst: &mut [u64; BLOCK_WORDS],
    x: &[u64; BLOCK_WORDS],
    y: &[u64; BLOCK_WORDS],
    xor_into: bool,
) {
    unsafe {
        // `r` preserves the pristine X ^ Y for the final feed-forward XOR;
        // `q` is the working copy permuted in place by the P rounds.
        let mut r = [0u64; BLOCK_WORDS];
        let mut q = [0u64; BLOCK_WORDS];
        let mut i = 0;
        while i < BLOCK_WORDS {
            // One 128-bit lane holds two u64 words, hence the step of 2.
            let xv = v128_load(x.as_ptr().add(i).cast());
            let yv = v128_load(y.as_ptr().add(i).cast());
            let rv = v128_xor(xv, yv);
            v128_store(r.as_mut_ptr().add(i).cast(), rv);
            v128_store(q.as_mut_ptr().add(i).cast(), rv);
            i += 2;
        }
        // Row pass: each of the 8 rows is 16 consecutive words; the four
        // logical quads (a, b, c, d) are each split into lo/hi v128 halves.
        let mut row = 0usize;
        while row < 8 {
            let base = row * 16;
            let mut a_lo = v128_load(q.as_ptr().add(base).cast());
            let mut a_hi = v128_load(q.as_ptr().add(base + 2).cast());
            let mut b_lo = v128_load(q.as_ptr().add(base + 4).cast());
            let mut b_hi = v128_load(q.as_ptr().add(base + 6).cast());
            let mut c_lo = v128_load(q.as_ptr().add(base + 8).cast());
            let mut c_hi = v128_load(q.as_ptr().add(base + 10).cast());
            let mut d_lo = v128_load(q.as_ptr().add(base + 12).cast());
            let mut d_hi = v128_load(q.as_ptr().add(base + 14).cast());
            p_round(
                &mut a_lo, &mut a_hi, &mut b_lo, &mut b_hi, &mut c_lo, &mut c_hi, &mut d_lo, &mut d_hi,
            );
            v128_store(q.as_mut_ptr().add(base).cast(), a_lo);
            v128_store(q.as_mut_ptr().add(base + 2).cast(), a_hi);
            v128_store(q.as_mut_ptr().add(base + 4).cast(), b_lo);
            v128_store(q.as_mut_ptr().add(base + 6).cast(), b_hi);
            v128_store(q.as_mut_ptr().add(base + 8).cast(), c_lo);
            v128_store(q.as_mut_ptr().add(base + 10).cast(), c_hi);
            v128_store(q.as_mut_ptr().add(base + 12).cast(), d_lo);
            v128_store(q.as_mut_ptr().add(base + 14).cast(), d_hi);
            row += 1;
        }
        // Column pass: group `col` starts at word 2*col and takes one word
        // pair from every row (stride 16), i.e. pairs at +0, +16, ..., +112.
        let mut col = 0usize;
        while col < 8 {
            let base = col * 2;
            let mut a_lo = v128_load(q.as_ptr().add(base).cast());
            let mut a_hi = v128_load(q.as_ptr().add(base + 16).cast());
            let mut b_lo = v128_load(q.as_ptr().add(base + 32).cast());
            let mut b_hi = v128_load(q.as_ptr().add(base + 48).cast());
            let mut c_lo = v128_load(q.as_ptr().add(base + 64).cast());
            let mut c_hi = v128_load(q.as_ptr().add(base + 80).cast());
            let mut d_lo = v128_load(q.as_ptr().add(base + 96).cast());
            let mut d_hi = v128_load(q.as_ptr().add(base + 112).cast());
            p_round(
                &mut a_lo, &mut a_hi, &mut b_lo, &mut b_hi, &mut c_lo, &mut c_hi, &mut d_lo, &mut d_hi,
            );
            v128_store(q.as_mut_ptr().add(base).cast(), a_lo);
            v128_store(q.as_mut_ptr().add(base + 16).cast(), a_hi);
            v128_store(q.as_mut_ptr().add(base + 32).cast(), b_lo);
            v128_store(q.as_mut_ptr().add(base + 48).cast(), b_hi);
            v128_store(q.as_mut_ptr().add(base + 64).cast(), c_lo);
            v128_store(q.as_mut_ptr().add(base + 80).cast(), c_hi);
            v128_store(q.as_mut_ptr().add(base + 96).cast(), d_lo);
            v128_store(q.as_mut_ptr().add(base + 112).cast(), d_hi);
            col += 1;
        }
        // Feed-forward: emit P(R) ^ R, optionally XORed into `dst`.
        let mut i = 0;
        while i < BLOCK_WORDS {
            let qv = v128_load(q.as_ptr().add(i).cast());
            let rv = v128_load(r.as_ptr().add(i).cast());
            let f = v128_xor(qv, rv);
            if xor_into {
                let cur = v128_load(dst.as_ptr().add(i).cast());
                v128_store(dst.as_mut_ptr().add(i).cast(), v128_xor(cur, f));
            } else {
                v128_store(dst.as_mut_ptr().add(i).cast(), f);
            }
            i += 2;
        }
    }
}
/// One P permutation round over a 4x4 matrix of u64 words held as eight
/// 128-bit halves: each `*_lo` carries lanes 0-1 and `*_hi` lanes 2-3 of
/// the four logical rows `a`, `b`, `c`, `d`.
///
/// Runs [`gb`] on the columns, diagonalizes (rotate row `b` left by one
/// lane, `c` by two, `d` by three), runs [`gb`] on the diagonals, then
/// applies the inverse rotations to restore the row layout.
#[inline(always)]
#[allow(clippy::too_many_arguments)]
fn p_round(
    a_lo: &mut v128,
    a_hi: &mut v128,
    b_lo: &mut v128,
    b_hi: &mut v128,
    c_lo: &mut v128,
    c_hi: &mut v128,
    d_lo: &mut v128,
    d_hi: &mut v128,
) {
    gb(a_lo, a_hi, b_lo, b_hi, c_lo, c_hi, d_lo, d_hi);
    // Diagonalize. Note `i64x2_shuffle::<1, 2>(p, q)` yields [p[1], q[0]].
    let tb_lo = *b_lo;
    let tb_hi = *b_hi;
    // b <- [b1, b2, b3, b0]  (rotate left by one lane)
    *b_lo = i64x2_shuffle::<1, 2>(tb_lo, tb_hi);
    *b_hi = i64x2_shuffle::<1, 2>(tb_hi, tb_lo);
    // c <- [c2, c3, c0, c1]  (rotate by two lanes == swap halves)
    core::mem::swap(c_lo, c_hi);
    let td_lo = *d_lo;
    let td_hi = *d_hi;
    // d <- [d3, d0, d1, d2]  (rotate left by three == right by one)
    *d_lo = i64x2_shuffle::<1, 2>(td_hi, td_lo);
    *d_hi = i64x2_shuffle::<1, 2>(td_lo, td_hi);
    gb(a_lo, a_hi, b_lo, b_hi, c_lo, c_hi, d_lo, d_hi);
    // Undiagonalize: the inverse rotations of the block above.
    let tb_lo = *b_lo;
    let tb_hi = *b_hi;
    // b <- [b3, b0, b1, b2]  (rotate right by one lane)
    *b_lo = i64x2_shuffle::<1, 2>(tb_hi, tb_lo);
    *b_hi = i64x2_shuffle::<1, 2>(tb_lo, tb_hi);
    // c: swapping halves again restores the original order.
    core::mem::swap(c_lo, c_hi);
    let td_lo = *d_lo;
    let td_hi = *d_hi;
    // d <- [d1, d2, d3, d0]  (rotate left by one lane)
    *d_lo = i64x2_shuffle::<1, 2>(td_lo, td_hi);
    *d_hi = i64x2_shuffle::<1, 2>(td_hi, td_lo);
}
/// BlaMka quarter-round `G` applied to two independent column pairs at once
/// (each `*_lo` / `*_hi` v128 holds two u64 lanes processed in parallel).
///
/// Each mixing step is `a = a + b + 2 * lo32(a) * lo32(b)` (the extra term
/// comes from [`bla_mul`]) followed by an XOR-rotate of the opposite
/// operand; the rotation amounts are 32, 24, 16 and 63 bits, the BLAKE2b
/// rotation schedule. All arithmetic wraps mod 2^64.
#[inline(always)]
#[allow(clippy::too_many_arguments)]
fn gb(
    a_lo: &mut v128,
    a_hi: &mut v128,
    b_lo: &mut v128,
    b_hi: &mut v128,
    c_lo: &mut v128,
    c_hi: &mut v128,
    d_lo: &mut v128,
    d_hi: &mut v128,
) {
    // a += b + 2*lo(a)*lo(b);  d = (d ^ a) >>> 32
    let p_lo = bla_mul(*a_lo, *b_lo);
    let p_hi = bla_mul(*a_hi, *b_hi);
    *a_lo = i64x2_add(i64x2_add(*a_lo, *b_lo), p_lo);
    *a_hi = i64x2_add(i64x2_add(*a_hi, *b_hi), p_hi);
    *d_lo = ror32(v128_xor(*d_lo, *a_lo));
    *d_hi = ror32(v128_xor(*d_hi, *a_hi));
    // c += d + 2*lo(c)*lo(d);  b = (b ^ c) >>> 24
    let p_lo = bla_mul(*c_lo, *d_lo);
    let p_hi = bla_mul(*c_hi, *d_hi);
    *c_lo = i64x2_add(i64x2_add(*c_lo, *d_lo), p_lo);
    *c_hi = i64x2_add(i64x2_add(*c_hi, *d_hi), p_hi);
    *b_lo = ror24(v128_xor(*b_lo, *c_lo));
    *b_hi = ror24(v128_xor(*b_hi, *c_hi));
    // a += b + 2*lo(a)*lo(b);  d = (d ^ a) >>> 16
    let p_lo = bla_mul(*a_lo, *b_lo);
    let p_hi = bla_mul(*a_hi, *b_hi);
    *a_lo = i64x2_add(i64x2_add(*a_lo, *b_lo), p_lo);
    *a_hi = i64x2_add(i64x2_add(*a_hi, *b_hi), p_hi);
    *d_lo = ror16(v128_xor(*d_lo, *a_lo));
    *d_hi = ror16(v128_xor(*d_hi, *a_hi));
    // c += d + 2*lo(c)*lo(d);  b = (b ^ c) >>> 63
    let p_lo = bla_mul(*c_lo, *d_lo);
    let p_hi = bla_mul(*c_hi, *d_hi);
    *c_lo = i64x2_add(i64x2_add(*c_lo, *d_lo), p_lo);
    *c_hi = i64x2_add(i64x2_add(*c_hi, *d_hi), p_hi);
    *b_lo = ror63(v128_xor(*b_lo, *c_lo));
    *b_hi = ror63(v128_xor(*b_hi, *c_hi));
}
#[inline(always)]
fn bla_mul(a: v128, b: v128) -> v128 {
let mask = u64x2_splat(0xffff_ffff);
let al = v128_and(a, mask);
let bl = v128_and(b, mask);
u64x2_shl(i64x2_mul(al, bl), 1)
}
#[inline(always)]
fn ror32(x: v128) -> v128 {
i8x16_shuffle::<4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11>(x, x)
}
#[inline(always)]
fn ror24(x: v128) -> v128 {
v128_or(u64x2_shr(x, 24), u64x2_shl(x, 40))
}
#[inline(always)]
fn ror16(x: v128) -> v128 {
i8x16_shuffle::<2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9>(x, x)
}
#[inline(always)]
fn ror63(x: v128) -> v128 {
v128_or(u64x2_shr(x, 63), u64x2_shl(x, 1))
}