#[cfg(all(target_arch = "aarch64", not(feature = "force-scalar")))]
pub mod neon_backend {
use crate::{Backend, BLOCKSIZE, ROUNDS, CubeHashParams, rounds_for_rev};
use std::convert::TryInto;
use core::arch::aarch64::*;
/// Reads the first 16 bytes of `bytes` as four little-endian `u32` words and
/// returns them in a vector with the word order reversed: lane 0 holds the
/// word at byte offset 12, lane 3 the word at byte offset 0.
///
/// # Panics
/// Panics if `bytes` is shorter than 16 bytes.
///
/// # Safety
/// The function is `unsafe` because it calls a NEON load intrinsic; the caller
/// must ensure those instructions are available on the running CPU. The
/// pointer handed to the intrinsic refers to a local four-element array.
#[inline(always)]
unsafe fn load_le_u32x4_reversed(bytes: &[u8]) -> uint32x4_t {
    // Decode one little-endian word starting at `offset`.
    let word_at = |offset: usize| -> u32 {
        u32::from_le_bytes(bytes[offset..offset + 4].try_into().unwrap())
    };
    // Highest offset first, so the last word of the input lands in lane 0.
    let lanes = [word_at(12), word_at(8), word_at(4), word_at(0)];
    vld1q_u32(lanes.as_ptr())
}
/// Serialises a vector to 16 bytes, emitting the lanes in reverse order
/// (lane 3 first, lane 0 last), each lane as a little-endian `u32`.
///
/// This is the inverse of the lane ordering produced by
/// `load_le_u32x4_reversed`.
///
/// # Safety
/// The function is `unsafe` because it calls a NEON store intrinsic; the
/// caller must ensure those instructions are available on the running CPU.
/// The store targets a local four-element array.
#[inline(always)]
unsafe fn to_bytes_d_c_b_a(v: uint32x4_t) -> [u8; 16] {
    let mut lanes = [0u32; 4];
    vst1q_u32(lanes.as_mut_ptr(), v);

    let mut out = [0u8; 16];
    // Walk the lanes back to front while filling the output front to back.
    for (chunk, lane) in out.chunks_exact_mut(4).zip(lanes.iter().rev()) {
        chunk.copy_from_slice(&lane.to_le_bytes());
    }
    out
}
/// Swaps the two 32-bit lanes inside each 64-bit half of `v`:
/// `[a, b, c, d]` becomes `[b, a, d, c]`.
///
/// # Safety
/// The function is `unsafe` because it calls a NEON intrinsic; the caller
/// must ensure those instructions are available on the running CPU.
#[inline(always)]
unsafe fn permute_badc(v: uint32x4_t) -> uint32x4_t {
    vrev64q_u32(v)
}
/// Swaps the lower and upper 64-bit halves of `v`:
/// `[a, b, c, d]` becomes `[c, d, a, b]`.
///
/// # Safety
/// The function is `unsafe` because it calls a NEON intrinsic; the caller
/// must ensure those instructions are available on the running CPU.
#[inline(always)]
unsafe fn permute_cdab(v: uint32x4_t) -> uint32x4_t {
    // Extracting four lanes starting at lane 2 of `v:v` rotates by two lanes.
    vextq_u32::<2>(v, v)
}
/// Hash state for the NEON backend: eight vectors of four 32-bit lanes each
/// (32 words in total).
///
/// `x0` and `x1` receive the input block, `x0`..`x3` are what `output_full`
/// serialises, and `x7` carries the finalisation flag.
pub struct NEON {
    x0: uint32x4_t,
    x1: uint32x4_t,
    x2: uint32x4_t,
    x3: uint32x4_t,
    x4: uint32x4_t,
    x5: uint32x4_t,
    x6: uint32x4_t,
    x7: uint32x4_t,
}
impl NEON {
    /// Runs `ROUNDS` iterations of the mixing round over the state in place.
    ///
    /// Every iteration consists of two half-rounds with the same shape:
    /// 1. add the low half (`x0`..`x3`) into a lane-permuted high half
    ///    (`x4`..`x7`),
    /// 2. rotate each low vector left (by 7, then by 11 bits) while swapping
    ///    vectors (`x0<->x2, x1<->x3` first, then `x0<->x1, x2<->x3`),
    /// 3. xor the high half into the low half.
    ///
    /// The lane permutation applied before the first add (`permute_badc`)
    /// acts on the high half as it was left by the previous iteration, so the
    /// stored `x4`..`x7` are always one `badc` permutation "behind".
    /// NOTE(review): this is presumably the CubeHash round with its final lane
    /// swap deferred to the next round; confirm against the scalar backend.
    ///
    /// # Safety
    /// The function is `unsafe` because it calls NEON intrinsics; the caller
    /// must ensure those instructions are available on the running CPU.
    #[inline(always)]
    unsafe fn rounds(&mut self) {
        // Rotates every 32-bit lane left by 7 bits.
        #[inline(always)]
        unsafe fn rotl7(v: uint32x4_t) -> uint32x4_t {
            veorq_u32(vshlq_n_u32::<7>(v), vshrq_n_u32::<25>(v))
        }

        // Rotates every 32-bit lane left by 11 bits.
        #[inline(always)]
        unsafe fn rotl11(v: uint32x4_t) -> uint32x4_t {
            veorq_u32(vshlq_n_u32::<11>(v), vshrq_n_u32::<21>(v))
        }

        // Work on local copies; the state is written back once after the loop.
        let (mut lo0, mut lo1, mut lo2, mut lo3) = (self.x0, self.x1, self.x2, self.x3);
        let (mut hi0, mut hi1, mut hi2, mut hi3) = (self.x4, self.x5, self.x6, self.x7);

        for _ in 0..ROUNDS {
            // First half-round: add, rotate by 7 with x0<->x2 / x1<->x3, xor.
            hi0 = vaddq_u32(lo0, permute_badc(hi0));
            hi1 = vaddq_u32(lo1, permute_badc(hi1));
            hi2 = vaddq_u32(lo2, permute_badc(hi2));
            hi3 = vaddq_u32(lo3, permute_badc(hi3));
            let (r0, r1, r2, r3) = (rotl7(lo2), rotl7(lo3), rotl7(lo0), rotl7(lo1));
            lo0 = veorq_u32(r0, hi0);
            lo1 = veorq_u32(r1, hi1);
            lo2 = veorq_u32(r2, hi2);
            lo3 = veorq_u32(r3, hi3);

            // Second half-round: add, rotate by 11 with x0<->x1 / x2<->x3, xor.
            hi0 = vaddq_u32(lo0, permute_cdab(hi0));
            hi1 = vaddq_u32(lo1, permute_cdab(hi1));
            hi2 = vaddq_u32(lo2, permute_cdab(hi2));
            hi3 = vaddq_u32(lo3, permute_cdab(hi3));
            let (r0, r1, r2, r3) = (rotl11(lo1), rotl11(lo0), rotl11(lo3), rotl11(lo2));
            lo0 = veorq_u32(r0, hi0);
            lo1 = veorq_u32(r1, hi1);
            lo2 = veorq_u32(r2, hi2);
            lo3 = veorq_u32(r3, hi3);
        }

        self.x0 = lo0;
        self.x1 = lo1;
        self.x2 = lo2;
        self.x3 = lo3;
        self.x4 = hi0;
        self.x5 = hi1;
        self.x6 = hi2;
        self.x7 = hi3;
    }
}
impl Backend for NEON {
    /// Builds the initial state for `params`.
    ///
    /// `x0` is seeded with the lanes `[0, ROUNDS, BLOCKSIZE, hash_len_bits / 8]`,
    /// every other vector starts at zero, and `rounds()` is then applied
    /// `irounds / ROUNDS` times, where `irounds` is the first value returned
    /// by `rounds_for_rev(params.revision)`.
    fn new(params: CubeHashParams) -> Self {
        unsafe {
            let (irounds, _frounds) = rounds_for_rev(params.revision);
            let seed_lanes = [
                0u32,
                ROUNDS as u32,
                BLOCKSIZE as u32,
                (params.hash_len_bits / 8) as u32,
            ];
            let zero = vdupq_n_u32(0);
            let mut state = NEON {
                x0: vld1q_u32(seed_lanes.as_ptr()),
                x1: zero,
                x2: zero,
                x3: zero,
                x4: zero,
                x5: zero,
                x6: zero,
                x7: zero,
            };
            let init_passes = irounds / ROUNDS;
            for _ in 0..init_passes {
                state.rounds();
            }
            state
        }
    }

    /// Xors one input block into `x0`/`x1` and then runs `rounds()`.
    ///
    /// Only the first 32 bytes of `block32` are read. The length is checked
    /// against `BLOCKSIZE` in debug builds only; a block shorter than 32
    /// bytes panics on slicing in every build.
    fn absorb_block(&mut self, block32: &[u8]) {
        unsafe {
            debug_assert_eq!(block32.len(), BLOCKSIZE);
            let first_half = load_le_u32x4_reversed(&block32[..16]);
            self.x0 = veorq_u32(self.x0, first_half);
            let second_half = load_le_u32x4_reversed(&block32[16..32]);
            self.x1 = veorq_u32(self.x1, second_half);
            self.rounds();
        }
    }

    /// Marks the state as finalising by xoring `1` into lane 1 of `x7`.
    fn set_finalize_flag(&mut self) {
        let flag_lanes = [0u32, 1, 0, 0];
        unsafe {
            let flag = vld1q_u32(flag_lanes.as_ptr());
            self.x7 = veorq_u32(self.x7, flag);
        }
    }

    /// Runs `rounds()` without absorbing any input.
    fn rounds_only(&mut self) {
        unsafe {
            self.rounds();
        }
    }

    /// Serialises `x0`..`x3` into 64 bytes, 16 bytes per vector, using the
    /// reversed-lane little-endian layout of `to_bytes_d_c_b_a`.
    fn output_full(&self) -> [u8; 64] {
        unsafe {
            let mut out = [0u8; 64];
            let low_half = [self.x0, self.x1, self.x2, self.x3];
            for (chunk, v) in out.chunks_exact_mut(16).zip(low_half.iter()) {
                chunk.copy_from_slice(&to_bytes_d_c_b_a(*v));
            }
            out
        }
    }
}
}
#[cfg(all(target_arch = "aarch64", not(feature = "force-scalar")))]
pub use neon_backend::NEON;