#![allow(unsafe_code)]
#![allow(clippy::indexing_slicing)]
use core::{
arch::aarch64::*,
ops::{BitXor, BitXorAssign},
};
use super::keys::{
CRC16_CCITT_KEYS_REFLECTED, CRC16_CCITT_STREAM_REFLECTED, CRC16_IBM_KEYS_REFLECTED, CRC16_IBM_STREAM_REFLECTED,
};
/// Newtype over a 128-bit NEON byte vector (`uint8x16_t`).
///
/// `#[repr(transparent)]` keeps the layout identical to the wrapped vector, which is what
/// lets the entry points in this file reinterpret byte slices as `[Simd; 8]` blocks via
/// `align_to`.
#[repr(transparent)]
#[derive(Copy, Clone)]
struct Simd(uint8x16_t);
/// Bitwise XOR of two 128-bit vectors.
impl BitXor for Simd {
    type Output = Self;

    #[inline]
    fn bitxor(self, rhs: Self) -> Self::Output {
        // SAFETY: `veorq_u8` only operates on the two register values passed in.
        // NOTE(review): this method carries no `#[target_feature]`, so it relies on NEON
        // being usable wherever a `Simd` value exists — confirm.
        let xored = unsafe { veorq_u8(self.0, rhs.0) };
        Self(xored)
    }
}
/// In-place XOR, defined in terms of the by-value `BitXor` implementation.
impl BitXorAssign for Simd {
    #[inline]
    fn bitxor_assign(&mut self, rhs: Self) {
        // `Simd` is `Copy`, so the current value can be read out and replaced.
        let current = *self;
        *self = Self::bitxor(current, rhs);
    }
}
impl Simd {
#[inline]
#[target_feature(enable = "neon")]
unsafe fn new(high: u64, low: u64) -> Self {
Self(vcombine_u8(vcreate_u8(low), vcreate_u8(high)))
}
#[inline]
#[target_feature(enable = "neon")]
unsafe fn load(ptr: *const u8) -> Self {
unsafe { Self(vld1q_u8(ptr)) }
}
#[inline]
#[target_feature(enable = "neon")]
unsafe fn and(self, mask: Self) -> Self {
Self(vandq_u8(self.0, mask.0))
}
#[inline]
#[target_feature(enable = "neon")]
unsafe fn shift_right_8(self) -> Self {
Self(vextq_u8(self.0, vdupq_n_u8(0), 8))
}
#[inline]
#[target_feature(enable = "neon")]
unsafe fn shift_left_12(self) -> Self {
let low_32 = vgetq_lane_u32(vreinterpretq_u32_u8(self.0), 0);
let result = vsetq_lane_u32(low_32, vdupq_n_u32(0), 3);
Self(vreinterpretq_u8_u32(result))
}
#[inline]
#[target_feature(enable = "aes")]
unsafe fn clmul00(self, other: Self) -> Self {
let a = vgetq_lane_p64(vreinterpretq_p64_u8(self.0), 0);
let b = vgetq_lane_p64(vreinterpretq_p64_u8(other.0), 0);
Self(vreinterpretq_u8_p128(vmull_p64(a, b)))
}
#[inline]
#[target_feature(enable = "aes")]
unsafe fn clmul01(self, other: Self) -> Self {
let a = vgetq_lane_p64(vreinterpretq_p64_u8(self.0), 1);
let b = vgetq_lane_p64(vreinterpretq_p64_u8(other.0), 0);
Self(vreinterpretq_u8_p128(vmull_p64(a, b)))
}
#[inline]
#[target_feature(enable = "aes")]
unsafe fn clmul10(self, other: Self) -> Self {
let a = vgetq_lane_p64(vreinterpretq_p64_u8(self.0), 0);
let b = vgetq_lane_p64(vreinterpretq_p64_u8(other.0), 1);
Self(vreinterpretq_u8_p128(vmull_p64(a, b)))
}
#[inline]
#[target_feature(enable = "aes")]
unsafe fn clmul11(self, other: Self) -> Self {
let a = vgetq_lane_p64(vreinterpretq_p64_u8(self.0), 1);
let b = vgetq_lane_p64(vreinterpretq_p64_u8(other.0), 1);
Self(vreinterpretq_u8_p128(vmull_p64(a, b)))
}
#[inline]
#[target_feature(enable = "aes")]
unsafe fn fold_16_reflected(self, coeff: Self, data_to_xor: Self) -> Self {
unsafe {
let h = self.clmul10(coeff);
let l = self.clmul01(coeff);
h ^ l ^ data_to_xor
}
}
#[inline]
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn fold_width32_reflected(self, high: u64, low: u64) -> Self {
unsafe {
let coeff_low = Self::new(0, low);
let coeff_high = Self::new(high, 0);
let clmul = self.clmul00(coeff_low);
let shifted = self.shift_right_8();
let mut state = clmul ^ shifted;
let mask2 = Self::new(0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_0000_0000);
let masked = state.and(mask2);
let shifted = state.shift_left_12();
let clmul = shifted.clmul11(coeff_high);
state = clmul ^ masked;
state
}
}
#[inline]
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn barrett_width32_reflected(self, poly: u64, mu: u64) -> u32 {
unsafe {
let polymu = Self::new(poly, mu);
let clmul1 = self.clmul00(polymu);
let clmul2 = clmul1.clmul10(polymu);
let xorred = self ^ clmul2;
let hi = xorred.shift_right_8();
vgetq_lane_u32(vreinterpretq_u32_u8(hi.0), 0)
}
}
}
/// Collapses the eight 16-byte accumulator lanes into a single 32-bit value.
///
/// Lanes 0..=6 are each folded onto the running result (seeded with lane 7) using a
/// per-lane coefficient taken from `keys`; the result is then reduced with
/// `fold_width32_reflected` (keys 6/5) and `barrett_width32_reflected` (keys 8/7).
///
/// # Safety
/// The CPU must support the `aes` and `neon` target features.
#[inline]
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn finalize_lanes_width32_reflected(x: [Simd; 8], keys: &[u64; 23]) -> u32 {
    // `(high, low)` indices into `keys` for lanes 0..=6, in lane order.
    const LANE_KEY_INDICES: [(usize, usize); 7] =
        [(10, 9), (12, 11), (14, 13), (16, 15), (18, 17), (20, 19), (2, 1)];

    let [leading @ .., last] = x;
    // SAFETY: `aes` and `neon` are enabled on this function, covering every callee.
    unsafe {
        let mut acc = last;
        for (lane, &(hi, lo)) in leading.iter().zip(LANE_KEY_INDICES.iter()) {
            acc = (*lane).fold_16_reflected(Simd::new(keys[hi], keys[lo]), acc);
        }
        acc.fold_width32_reflected(keys[6], keys[5])
            .barrett_width32_reflected(keys[8], keys[7])
    }
}
/// Folds one 128-byte block into the accumulator, lane by lane.
///
/// Each `x[i]` becomes `x[i].fold_16_reflected(coeff, chunk[i])`.
///
/// # Safety
/// The CPU must support the `aes` and `neon` target features.
#[inline]
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn fold_block_128_width32_reflected(x: &mut [Simd; 8], chunk: &[Simd; 8], coeff: Simd) {
    // SAFETY: `aes` is enabled on this function, which is all the lane fold requires.
    unsafe {
        for (lane, incoming) in x.iter_mut().zip(chunk.iter()) {
            *lane = (*lane).fold_16_reflected(coeff, *incoming);
        }
    }
}
/// Single-lane fold that merges both products and the incoming data with one
/// three-way XOR (`veor3q_u8`).
///
/// Produces the same value as `Simd::fold_16_reflected(x, coeff, data_to_xor)`.
///
/// # Safety
/// The CPU must support the `aes`, `neon` and `sha3` target features.
#[inline]
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
#[target_feature(enable = "aes", enable = "neon", enable = "sha3")]
unsafe fn fold_lane_width32_reflected_eor3(x: Simd, coeff: Simd, data_to_xor: Simd) -> Simd {
    // SAFETY: the required target features are enabled on this function.
    unsafe {
        let low_by_high = x.clmul10(coeff);
        let high_by_low = x.clmul01(coeff);
        let merged = veor3q_u8(data_to_xor.0, low_by_high.0, high_by_low.0);
        Simd(merged)
    }
}
/// EOR3 flavour of the 128-byte block fold.
///
/// Each `x[i]` becomes `fold_lane_width32_reflected_eor3(x[i], coeff, chunk[i])`.
///
/// # Safety
/// The CPU must support the `aes`, `neon` and `sha3` target features.
#[inline]
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
#[target_feature(enable = "aes", enable = "neon", enable = "sha3")]
unsafe fn fold_block_128_width32_reflected_eor3(x: &mut [Simd; 8], chunk: &[Simd; 8], coeff: Simd) {
    // SAFETY: the required target features are enabled on this function.
    unsafe {
        for (lane, incoming) in x.iter_mut().zip(chunk.iter()) {
            *lane = fold_lane_width32_reflected_eor3(*lane, coeff, *incoming);
        }
    }
}
/// Folds `blocks` into `state` using two interleaved accumulator streams.
///
/// Even-indexed blocks feed stream `s0` and odd-indexed blocks feed `s1`; each stream
/// advances two blocks per step, so both use the 256-byte coefficient `fold_256b`.
/// The streams are merged with the 128-byte coefficient (`keys[4]`, `keys[3]`), an odd
/// trailing block is folded in, and the lanes are reduced to a 32-bit value.
///
/// # Safety
/// - The CPU must support the `aes` and `neon` target features.
/// - `blocks.len()` must be at least 2; a shorter slice is undefined behaviour.
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn update_simd_width32_reflected_2way(
    state: u32,
    blocks: &[[Simd; 8]],
    fold_256b: (u64, u64),
    keys: &[u64; 23],
) -> u32 {
    use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
    const BLOCK_SIZE: usize = 128;
    const DOUBLE_GROUP: usize = 4;
    const PREFETCH_BLOCKS: usize = LARGE_BLOCK_DISTANCE / BLOCK_SIZE;
    // SAFETY: target features are enabled on this function; every pointer below stays
    // inside `blocks` because `ptr + rem` always equals the end of the slice.
    unsafe {
        debug_assert!(blocks.len() >= 2);
        if blocks.len() < 2 {
            core::hint::unreachable_unchecked()
        }
        let coeff_256b = Simd::new(fold_256b.0, fold_256b.1);
        let coeff_128b = Simd::new(keys[4], keys[3]);
        let blocks_ptr = blocks.as_ptr();
        let mut s0 = *blocks_ptr;
        let mut s1 = *blocks_ptr.add(1);
        // Inject the incoming CRC state into the first 16 bytes of the data.
        s0[0] ^= Simd::new(0, state as u64);

        let mut ptr = blocks_ptr.add(2);
        // Invariant: `rem` is the number of unprocessed blocks starting at `ptr`.
        let mut rem = blocks.len() - 2;
        while rem >= DOUBLE_GROUP {
            // Only form the look-ahead pointer when it is in bounds: `pointer::add` must
            // not be used to step past the end of the slice.
            if rem > PREFETCH_BLOCKS {
                prefetch_read_l1(ptr.add(PREFETCH_BLOCKS).cast::<u8>());
            }
            fold_block_128_width32_reflected(&mut s0, &*ptr, coeff_256b);
            fold_block_128_width32_reflected(&mut s1, &*ptr.add(1), coeff_256b);
            fold_block_128_width32_reflected(&mut s0, &*ptr.add(2), coeff_256b);
            fold_block_128_width32_reflected(&mut s1, &*ptr.add(3), coeff_256b);
            ptr = ptr.add(DOUBLE_GROUP);
            rem -= DOUBLE_GROUP;
        }
        while rem >= 2 {
            fold_block_128_width32_reflected(&mut s0, &*ptr, coeff_256b);
            fold_block_128_width32_reflected(&mut s1, &*ptr.add(1), coeff_256b);
            ptr = ptr.add(2);
            rem -= 2;
        }
        // Merge: s0 trails s1 by one block, so advance it by 128 bytes and XOR in s1.
        fold_block_128_width32_reflected(&mut s0, &s1, coeff_128b);
        let mut combined = s0;
        if rem == 1 {
            fold_block_128_width32_reflected(&mut combined, &*ptr, coeff_128b);
        }
        finalize_lanes_width32_reflected(combined, keys)
    }
}
/// Folds `blocks` into `state` using three interleaved accumulator streams.
///
/// Block `i` feeds stream `i % 3`; each stream advances three blocks per step using the
/// 384-byte coefficient `fold_384b`. The streams are then merged (`s0` advanced by
/// `fold_256b`, `s1` by the 128-byte coefficient `keys[4]`/`keys[3]`), up to two trailing
/// blocks are folded in, and the lanes are reduced to a 32-bit value.
///
/// With fewer than three blocks this defers to `update_simd_width32_reflected`; with no
/// blocks at all it returns `state` unchanged.
///
/// # Safety
/// The CPU must support the `aes` and `neon` target features.
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn update_simd_width32_reflected_3way(
    state: u32,
    blocks: &[[Simd; 8]],
    fold_384b: (u64, u64),
    fold_256b: (u64, u64),
    keys: &[u64; 23],
) -> u32 {
    use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
    const BLOCK_SIZE: usize = 128;
    const DOUBLE_GROUP: usize = 6;
    const PREFETCH_BLOCKS: usize = LARGE_BLOCK_DISTANCE / BLOCK_SIZE;
    // SAFETY: target features are enabled on this function; every pointer below stays
    // inside `blocks` because `ptr + rem` always equals the end of the slice.
    unsafe {
        if blocks.len() < 3 {
            let Some((first, rest)) = blocks.split_first() else {
                return state;
            };
            return update_simd_width32_reflected(state, first, rest, keys);
        }
        let coeff_384b = Simd::new(fold_384b.0, fold_384b.1);
        let coeff_256b = Simd::new(fold_256b.0, fold_256b.1);
        let coeff_128b = Simd::new(keys[4], keys[3]);
        let mut s0 = blocks[0];
        let mut s1 = blocks[1];
        let mut s2 = blocks[2];
        // Inject the incoming CRC state into the first 16 bytes of the data.
        s0[0] ^= Simd::new(0, state as u64);

        let mut ptr = blocks.as_ptr().add(3);
        // Invariant: `rem` is the number of unprocessed blocks starting at `ptr`.
        let mut rem = blocks.len() - 3;
        while rem >= DOUBLE_GROUP {
            // Only form the look-ahead pointer when it is in bounds: `pointer::add` must
            // not be used to step past the end of the slice.
            if rem > PREFETCH_BLOCKS {
                prefetch_read_l1(ptr.add(PREFETCH_BLOCKS).cast::<u8>());
            }
            fold_block_128_width32_reflected(&mut s0, &*ptr, coeff_384b);
            fold_block_128_width32_reflected(&mut s1, &*ptr.add(1), coeff_384b);
            fold_block_128_width32_reflected(&mut s2, &*ptr.add(2), coeff_384b);
            fold_block_128_width32_reflected(&mut s0, &*ptr.add(3), coeff_384b);
            fold_block_128_width32_reflected(&mut s1, &*ptr.add(4), coeff_384b);
            fold_block_128_width32_reflected(&mut s2, &*ptr.add(5), coeff_384b);
            ptr = ptr.add(DOUBLE_GROUP);
            rem -= DOUBLE_GROUP;
        }
        while rem >= 3 {
            fold_block_128_width32_reflected(&mut s0, &*ptr, coeff_384b);
            fold_block_128_width32_reflected(&mut s1, &*ptr.add(1), coeff_384b);
            fold_block_128_width32_reflected(&mut s2, &*ptr.add(2), coeff_384b);
            ptr = ptr.add(3);
            rem -= 3;
        }
        // Merge: s1 = s1 * K128 ^ s2, then s0 = s0 * K256 ^ s1, i.e.
        // s0 * K256 ^ s1 * K128 ^ s2.
        fold_block_128_width32_reflected(&mut s1, &s2, coeff_128b);
        fold_block_128_width32_reflected(&mut s0, &s1, coeff_256b);
        let mut combined = s0;
        // Zero to two blocks are left over after the 3-wide groups.
        while rem > 0 {
            fold_block_128_width32_reflected(&mut combined, &*ptr, coeff_128b);
            ptr = ptr.add(1);
            rem -= 1;
        }
        finalize_lanes_width32_reflected(combined, keys)
    }
}
#[inline]
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn update_simd_width32_reflected(state: u32, first: &[Simd; 8], rest: &[[Simd; 8]], keys: &[u64; 23]) -> u32 {
use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
unsafe {
let mut x = *first;
x[0] ^= Simd::new(0, state as u64);
let coeff_128b = Simd::new(keys[4], keys[3]);
const BLOCK_SIZE: usize = 128;
const DOUBLE_GROUP: usize = 2; const PREFETCH_BLOCKS: usize = LARGE_BLOCK_DISTANCE / BLOCK_SIZE;
let rest_ptr = rest.as_ptr();
let rest_end = rest_ptr.add(rest.len());
let mut ptr = rest_ptr;
let double_end = rest_ptr.add((rest.len() / DOUBLE_GROUP) * DOUBLE_GROUP);
while ptr < double_end {
let prefetch_ptr = ptr.add(PREFETCH_BLOCKS);
if prefetch_ptr < rest_end {
prefetch_read_l1(prefetch_ptr.cast::<u8>());
}
let chunk0 = &*ptr;
x[0] = x[0].fold_16_reflected(coeff_128b, chunk0[0]);
x[1] = x[1].fold_16_reflected(coeff_128b, chunk0[1]);
x[2] = x[2].fold_16_reflected(coeff_128b, chunk0[2]);
x[3] = x[3].fold_16_reflected(coeff_128b, chunk0[3]);
x[4] = x[4].fold_16_reflected(coeff_128b, chunk0[4]);
x[5] = x[5].fold_16_reflected(coeff_128b, chunk0[5]);
x[6] = x[6].fold_16_reflected(coeff_128b, chunk0[6]);
x[7] = x[7].fold_16_reflected(coeff_128b, chunk0[7]);
let chunk1 = &*ptr.add(1);
x[0] = x[0].fold_16_reflected(coeff_128b, chunk1[0]);
x[1] = x[1].fold_16_reflected(coeff_128b, chunk1[1]);
x[2] = x[2].fold_16_reflected(coeff_128b, chunk1[2]);
x[3] = x[3].fold_16_reflected(coeff_128b, chunk1[3]);
x[4] = x[4].fold_16_reflected(coeff_128b, chunk1[4]);
x[5] = x[5].fold_16_reflected(coeff_128b, chunk1[5]);
x[6] = x[6].fold_16_reflected(coeff_128b, chunk1[6]);
x[7] = x[7].fold_16_reflected(coeff_128b, chunk1[7]);
ptr = ptr.add(DOUBLE_GROUP);
}
while ptr < rest_end {
let chunk = &*ptr;
x[0] = x[0].fold_16_reflected(coeff_128b, chunk[0]);
x[1] = x[1].fold_16_reflected(coeff_128b, chunk[1]);
x[2] = x[2].fold_16_reflected(coeff_128b, chunk[2]);
x[3] = x[3].fold_16_reflected(coeff_128b, chunk[3]);
x[4] = x[4].fold_16_reflected(coeff_128b, chunk[4]);
x[5] = x[5].fold_16_reflected(coeff_128b, chunk[5]);
x[6] = x[6].fold_16_reflected(coeff_128b, chunk[6]);
x[7] = x[7].fold_16_reflected(coeff_128b, chunk[7]);
ptr = ptr.add(1);
}
finalize_lanes_width32_reflected(x, keys)
}
}
/// EOR3 flavour of `update_simd_width32_reflected`: folds `first` and then every block
/// of `rest` into `state` with a single accumulator stream.
///
/// `state` is XORed into the low 64 bits of the first lane of `first`; each later block
/// is folded in with the 128-byte coefficient (`keys[4]`, `keys[3]`).
///
/// # Safety
/// The CPU must support the `aes`, `neon` and `sha3` target features.
#[inline]
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
#[target_feature(enable = "aes", enable = "neon", enable = "sha3")]
unsafe fn update_simd_width32_reflected_eor3(
    state: u32,
    first: &[Simd; 8],
    rest: &[[Simd; 8]],
    keys: &[u64; 23],
) -> u32 {
    use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
    const BLOCK_SIZE: usize = 128;
    const DOUBLE_GROUP: usize = 2;
    const PREFETCH_BLOCKS: usize = LARGE_BLOCK_DISTANCE / BLOCK_SIZE;
    // SAFETY: target features are enabled on this function; every pointer below stays
    // inside `rest` because `ptr + rem` always equals the end of the slice.
    unsafe {
        let mut x = *first;
        x[0] ^= Simd::new(0, state as u64);
        let coeff_128b = Simd::new(keys[4], keys[3]);

        let mut ptr = rest.as_ptr();
        // Invariant: `rem` is the number of unprocessed blocks starting at `ptr`.
        let mut rem = rest.len();
        while rem >= DOUBLE_GROUP {
            // Only form the look-ahead pointer when it is in bounds: `pointer::add` must
            // not be used to step past the end of the slice.
            if rem > PREFETCH_BLOCKS {
                prefetch_read_l1(ptr.add(PREFETCH_BLOCKS).cast::<u8>());
            }
            fold_block_128_width32_reflected_eor3(&mut x, &*ptr, coeff_128b);
            fold_block_128_width32_reflected_eor3(&mut x, &*ptr.add(1), coeff_128b);
            ptr = ptr.add(DOUBLE_GROUP);
            rem -= DOUBLE_GROUP;
        }
        // At most one block remains after the pairwise loop.
        if rem == 1 {
            fold_block_128_width32_reflected_eor3(&mut x, &*ptr, coeff_128b);
        }
        finalize_lanes_width32_reflected(x, keys)
    }
}
/// EOR3 flavour of the two-stream fold.
///
/// Even-indexed blocks feed stream `s0` and odd-indexed blocks feed `s1`, both advanced
/// with the 256-byte coefficient `fold_256b`. The streams are merged with the 128-byte
/// coefficient (`keys[4]`, `keys[3]`), an odd trailing block is folded in, and the lanes
/// are reduced to a 32-bit value.
///
/// # Safety
/// - The CPU must support the `aes`, `neon` and `sha3` target features.
/// - `blocks.len()` must be at least 2; a shorter slice is undefined behaviour.
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
#[target_feature(enable = "aes", enable = "neon", enable = "sha3")]
unsafe fn update_simd_width32_reflected_eor3_2way(
    state: u32,
    blocks: &[[Simd; 8]],
    fold_256b: (u64, u64),
    keys: &[u64; 23],
) -> u32 {
    use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
    const BLOCK_SIZE: usize = 128;
    const DOUBLE_GROUP: usize = 4;
    const PREFETCH_BLOCKS: usize = LARGE_BLOCK_DISTANCE / BLOCK_SIZE;
    // SAFETY: target features are enabled on this function; every pointer below stays
    // inside `blocks` because `ptr + rem` always equals the end of the slice.
    unsafe {
        debug_assert!(blocks.len() >= 2);
        if blocks.len() < 2 {
            core::hint::unreachable_unchecked()
        }
        let coeff_256b = Simd::new(fold_256b.0, fold_256b.1);
        let coeff_128b = Simd::new(keys[4], keys[3]);
        let blocks_ptr = blocks.as_ptr();
        let mut s0 = *blocks_ptr;
        let mut s1 = *blocks_ptr.add(1);
        // Inject the incoming CRC state into the first 16 bytes of the data.
        s0[0] ^= Simd::new(0, state as u64);

        let mut ptr = blocks_ptr.add(2);
        // Invariant: `rem` is the number of unprocessed blocks starting at `ptr`.
        let mut rem = blocks.len() - 2;
        while rem >= DOUBLE_GROUP {
            // Only form the look-ahead pointer when it is in bounds: `pointer::add` must
            // not be used to step past the end of the slice.
            if rem > PREFETCH_BLOCKS {
                prefetch_read_l1(ptr.add(PREFETCH_BLOCKS).cast::<u8>());
            }
            fold_block_128_width32_reflected_eor3(&mut s0, &*ptr, coeff_256b);
            fold_block_128_width32_reflected_eor3(&mut s1, &*ptr.add(1), coeff_256b);
            fold_block_128_width32_reflected_eor3(&mut s0, &*ptr.add(2), coeff_256b);
            fold_block_128_width32_reflected_eor3(&mut s1, &*ptr.add(3), coeff_256b);
            ptr = ptr.add(DOUBLE_GROUP);
            rem -= DOUBLE_GROUP;
        }
        while rem >= 2 {
            fold_block_128_width32_reflected_eor3(&mut s0, &*ptr, coeff_256b);
            fold_block_128_width32_reflected_eor3(&mut s1, &*ptr.add(1), coeff_256b);
            ptr = ptr.add(2);
            rem -= 2;
        }
        // Merge: s0 trails s1 by one block, so advance it by 128 bytes and XOR in s1.
        fold_block_128_width32_reflected_eor3(&mut s0, &s1, coeff_128b);
        let mut combined = s0;
        if rem == 1 {
            fold_block_128_width32_reflected_eor3(&mut combined, &*ptr, coeff_128b);
        }
        finalize_lanes_width32_reflected(combined, keys)
    }
}
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
#[target_feature(enable = "aes", enable = "neon", enable = "sha3")]
unsafe fn update_simd_width32_reflected_eor3_3way(
state: u32,
blocks: &[[Simd; 8]],
fold_384b: (u64, u64),
fold_256b: (u64, u64),
keys: &[u64; 23],
) -> u32 {
use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
unsafe {
if blocks.len() < 3 {
let Some((first, rest)) = blocks.split_first() else {
return state;
};
return update_simd_width32_reflected_eor3(state, first, rest, keys);
}
let coeff_384b = Simd::new(fold_384b.0, fold_384b.1);
let coeff_256b = Simd::new(fold_256b.0, fold_256b.1);
let coeff_128b = Simd::new(keys[4], keys[3]);
let mut s0 = blocks[0];
let mut s1 = blocks[1];
let mut s2 = blocks[2];
s0[0] ^= Simd::new(0, state as u64);
const BLOCK_SIZE: usize = 128;
const DOUBLE_GROUP: usize = 6; const PREFETCH_BLOCKS: usize = LARGE_BLOCK_DISTANCE / BLOCK_SIZE;
let blocks_ptr = blocks.as_ptr();
let blocks_end = blocks_ptr.add(blocks.len());
let mut ptr = blocks_ptr.add(3);
let double_end = blocks_ptr.add(3 + ((blocks.len() - 3) / DOUBLE_GROUP) * DOUBLE_GROUP);
while ptr < double_end {
let prefetch_ptr = ptr.add(PREFETCH_BLOCKS);
if prefetch_ptr < blocks_end {
prefetch_read_l1(prefetch_ptr.cast::<u8>());
}
fold_block_128_width32_reflected_eor3(&mut s0, &*ptr, coeff_384b);
fold_block_128_width32_reflected_eor3(&mut s1, &*ptr.add(1), coeff_384b);
fold_block_128_width32_reflected_eor3(&mut s2, &*ptr.add(2), coeff_384b);
fold_block_128_width32_reflected_eor3(&mut s0, &*ptr.add(3), coeff_384b);
fold_block_128_width32_reflected_eor3(&mut s1, &*ptr.add(4), coeff_384b);
fold_block_128_width32_reflected_eor3(&mut s2, &*ptr.add(5), coeff_384b);
ptr = ptr.add(DOUBLE_GROUP);
}
let triple_end = blocks_ptr.add((blocks.len() / 3) * 3);
while ptr < triple_end {
fold_block_128_width32_reflected_eor3(&mut s0, &*ptr, coeff_384b);
fold_block_128_width32_reflected_eor3(&mut s1, &*ptr.add(1), coeff_384b);
fold_block_128_width32_reflected_eor3(&mut s2, &*ptr.add(2), coeff_384b);
ptr = ptr.add(3);
}
let mut combined = s2;
combined[0] = fold_lane_width32_reflected_eor3(s1[0], coeff_128b, combined[0]);
combined[1] = fold_lane_width32_reflected_eor3(s1[1], coeff_128b, combined[1]);
combined[2] = fold_lane_width32_reflected_eor3(s1[2], coeff_128b, combined[2]);
combined[3] = fold_lane_width32_reflected_eor3(s1[3], coeff_128b, combined[3]);
combined[4] = fold_lane_width32_reflected_eor3(s1[4], coeff_128b, combined[4]);
combined[5] = fold_lane_width32_reflected_eor3(s1[5], coeff_128b, combined[5]);
combined[6] = fold_lane_width32_reflected_eor3(s1[6], coeff_128b, combined[6]);
combined[7] = fold_lane_width32_reflected_eor3(s1[7], coeff_128b, combined[7]);
combined[0] = fold_lane_width32_reflected_eor3(s0[0], coeff_256b, combined[0]);
combined[1] = fold_lane_width32_reflected_eor3(s0[1], coeff_256b, combined[1]);
combined[2] = fold_lane_width32_reflected_eor3(s0[2], coeff_256b, combined[2]);
combined[3] = fold_lane_width32_reflected_eor3(s0[3], coeff_256b, combined[3]);
combined[4] = fold_lane_width32_reflected_eor3(s0[4], coeff_256b, combined[4]);
combined[5] = fold_lane_width32_reflected_eor3(s0[5], coeff_256b, combined[5]);
combined[6] = fold_lane_width32_reflected_eor3(s0[6], coeff_256b, combined[6]);
combined[7] = fold_lane_width32_reflected_eor3(s0[7], coeff_256b, combined[7]);
while ptr < blocks_end {
fold_block_128_width32_reflected_eor3(&mut combined, &*ptr, coeff_128b);
ptr = ptr.add(1);
}
finalize_lanes_width32_reflected(combined, keys)
}
}
#[inline]
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn crc16_width32_pmull_small(
mut state: u16,
data: &[u8],
keys: &[u64; 23],
portable: fn(u16, &[u8]) -> u16,
) -> u16 {
unsafe {
let mut buf = data.as_ptr();
let mut len = data.len();
if len < 16 {
return portable(state, data);
}
let coeff_16b = Simd::new(keys[2], keys[1]);
let mut x0 = Simd::load(buf);
x0 ^= Simd::new(0, state as u64);
buf = buf.add(16);
len = len.strict_sub(16);
while len >= 16 {
let chunk = Simd::load(buf);
x0 = x0.fold_16_reflected(coeff_16b, chunk);
buf = buf.add(16);
len = len.strict_sub(16);
}
let x0 = x0.fold_width32_reflected(keys[6], keys[5]);
state = x0.barrett_width32_reflected(keys[8], keys[7]) as u16;
let tail = core::slice::from_raw_parts(buf, len);
portable(state, tail)
}
}
/// CRC update that runs the single-stream SIMD kernel over the aligned middle of `data`.
///
/// `align_to` splits `data` into an unaligned head, whole `[Simd; 8]` (128-byte) blocks
/// and a tail. The head and tail go through `portable`; the blocks go through
/// `update_simd_width32_reflected`. If there is no whole block, the entire input is
/// handed to `crc16_width32_pmull_small`.
///
/// # Safety
/// The CPU must support the `aes` and `neon` target features.
#[inline]
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn crc16_width32_pmull(mut state: u16, data: &[u8], keys: &[u64; 23], portable: fn(u16, &[u8]) -> u16) -> u16 {
    // SAFETY: `Simd` is a transparent wrapper over a plain byte vector, so any bytes are a
    // valid value for `align_to`; target features are enabled on this function.
    unsafe {
        let (head, body, tail) = data.align_to::<[Simd; 8]>();
        match body.split_first() {
            None => crc16_width32_pmull_small(state, data, keys, portable),
            Some((first, rest)) => {
                state = portable(state, head);
                let folded = update_simd_width32_reflected(u32::from(state), first, rest, keys);
                // Deliberate truncation: only the low 16 bits are carried forward.
                state = folded as u16;
                portable(state, tail)
            }
        }
    }
}
/// CRC update that prefers the two-stream SIMD kernel for the aligned middle of `data`.
///
/// `align_to` splits `data` into an unaligned head, whole `[Simd; 8]` (128-byte) blocks
/// and a tail. With no whole block the entire input goes to `crc16_width32_pmull_small`;
/// with exactly one block the single-stream kernel is used; with two or more,
/// `update_simd_width32_reflected_2way`. Head and tail go through `portable`.
///
/// # Safety
/// The CPU must support the `aes` and `neon` target features.
#[inline]
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn crc16_width32_pmull_2way(
    mut state: u16,
    data: &[u8],
    keys: &[u64; 23],
    fold_256b: (u64, u64),
    portable: fn(u16, &[u8]) -> u16,
) -> u16 {
    // SAFETY: `Simd` is a transparent wrapper over a plain byte vector, so any bytes are a
    // valid value for `align_to`; target features are enabled on this function.
    unsafe {
        let (head, body, tail) = data.align_to::<[Simd; 8]>();
        if body.is_empty() {
            return crc16_width32_pmull_small(state, data, keys, portable);
        }
        state = portable(state, head);
        let folded = match body {
            // A lone block cannot feed two streams; fold it with nothing following.
            [single] => update_simd_width32_reflected(u32::from(state), single, &[], keys),
            // `body` is non-empty and not a single block, so it holds at least two.
            _ => update_simd_width32_reflected_2way(u32::from(state), body, fold_256b, keys),
        };
        // Deliberate truncation: only the low 16 bits are carried forward.
        state = folded as u16;
        portable(state, tail)
    }
}
/// CRC update that runs the three-stream SIMD kernel over the aligned middle of `data`.
///
/// `align_to` splits `data` into an unaligned head, whole `[Simd; 8]` (128-byte) blocks
/// and a tail. With no whole block the entire input goes to `crc16_width32_pmull_small`;
/// otherwise head and tail go through `portable` and the blocks through
/// `update_simd_width32_reflected_3way`.
///
/// # Safety
/// The CPU must support the `aes` and `neon` target features.
#[inline]
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn crc16_width32_pmull_3way(
    mut state: u16,
    data: &[u8],
    keys: &[u64; 23],
    fold_384b: (u64, u64),
    fold_256b: (u64, u64),
    portable: fn(u16, &[u8]) -> u16,
) -> u16 {
    // SAFETY: `Simd` is a transparent wrapper over a plain byte vector, so any bytes are a
    // valid value for `align_to`; target features are enabled on this function.
    unsafe {
        let (head, body, tail) = data.align_to::<[Simd; 8]>();
        if body.is_empty() {
            crc16_width32_pmull_small(state, data, keys, portable)
        } else {
            state = portable(state, head);
            let folded = update_simd_width32_reflected_3way(u32::from(state), body, fold_384b, fold_256b, keys);
            // Deliberate truncation: only the low 16 bits are carried forward.
            state = folded as u16;
            portable(state, tail)
        }
    }
}
/// EOR3 flavour of `crc16_width32_pmull`.
///
/// `align_to` splits `data` into an unaligned head, whole `[Simd; 8]` (128-byte) blocks
/// and a tail. Head and tail go through `portable`; the blocks go through
/// `update_simd_width32_reflected_eor3`. If there is no whole block, the entire input is
/// handed to `crc16_width32_pmull_small`.
///
/// # Safety
/// The CPU must support the `aes`, `neon` and `sha3` target features.
#[inline]
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
#[target_feature(enable = "aes", enable = "neon", enable = "sha3")]
unsafe fn crc16_width32_pmull_eor3(
    mut state: u16,
    data: &[u8],
    keys: &[u64; 23],
    portable: fn(u16, &[u8]) -> u16,
) -> u16 {
    // SAFETY: `Simd` is a transparent wrapper over a plain byte vector, so any bytes are a
    // valid value for `align_to`; target features are enabled on this function.
    unsafe {
        let (head, body, tail) = data.align_to::<[Simd; 8]>();
        match body.split_first() {
            None => crc16_width32_pmull_small(state, data, keys, portable),
            Some((first, rest)) => {
                state = portable(state, head);
                let folded = update_simd_width32_reflected_eor3(u32::from(state), first, rest, keys);
                // Deliberate truncation: only the low 16 bits are carried forward.
                state = folded as u16;
                portable(state, tail)
            }
        }
    }
}
/// EOR3 flavour of `crc16_width32_pmull_2way`.
///
/// `align_to` splits `data` into an unaligned head, whole `[Simd; 8]` (128-byte) blocks
/// and a tail. With no whole block the entire input goes to `crc16_width32_pmull_small`;
/// with exactly one block the single-stream EOR3 kernel is used; with two or more,
/// `update_simd_width32_reflected_eor3_2way`. Head and tail go through `portable`.
///
/// # Safety
/// The CPU must support the `aes`, `neon` and `sha3` target features.
#[inline]
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
#[target_feature(enable = "aes", enable = "neon", enable = "sha3")]
unsafe fn crc16_width32_pmull_eor3_2way(
    mut state: u16,
    data: &[u8],
    keys: &[u64; 23],
    fold_256b: (u64, u64),
    portable: fn(u16, &[u8]) -> u16,
) -> u16 {
    // SAFETY: `Simd` is a transparent wrapper over a plain byte vector, so any bytes are a
    // valid value for `align_to`; target features are enabled on this function.
    unsafe {
        let (head, body, tail) = data.align_to::<[Simd; 8]>();
        if body.is_empty() {
            return crc16_width32_pmull_small(state, data, keys, portable);
        }
        state = portable(state, head);
        let folded = match body {
            // A lone block cannot feed two streams; fold it with nothing following.
            [single] => update_simd_width32_reflected_eor3(u32::from(state), single, &[], keys),
            // `body` is non-empty and not a single block, so it holds at least two.
            _ => update_simd_width32_reflected_eor3_2way(u32::from(state), body, fold_256b, keys),
        };
        // Deliberate truncation: only the low 16 bits are carried forward.
        state = folded as u16;
        portable(state, tail)
    }
}
/// EOR3 flavour of `crc16_width32_pmull_3way`.
///
/// `align_to` splits `data` into an unaligned head, whole `[Simd; 8]` (128-byte) blocks
/// and a tail. With no whole block the entire input goes to `crc16_width32_pmull_small`;
/// otherwise head and tail go through `portable` and the blocks through
/// `update_simd_width32_reflected_eor3_3way`.
///
/// # Safety
/// The CPU must support the `aes`, `neon` and `sha3` target features.
#[inline]
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
#[target_feature(enable = "aes", enable = "neon", enable = "sha3")]
unsafe fn crc16_width32_pmull_eor3_3way(
    mut state: u16,
    data: &[u8],
    keys: &[u64; 23],
    fold_384b: (u64, u64),
    fold_256b: (u64, u64),
    portable: fn(u16, &[u8]) -> u16,
) -> u16 {
    // SAFETY: `Simd` is a transparent wrapper over a plain byte vector, so any bytes are a
    // valid value for `align_to`; target features are enabled on this function.
    unsafe {
        let (head, body, tail) = data.align_to::<[Simd; 8]>();
        if body.is_empty() {
            crc16_width32_pmull_small(state, data, keys, portable)
        } else {
            state = portable(state, head);
            let folded =
                update_simd_width32_reflected_eor3_3way(u32::from(state), body, fold_384b, fold_256b, keys);
            // Deliberate truncation: only the low 16 bits are carried forward.
            state = folded as u16;
            portable(state, tail)
        }
    }
}
/// Safe entry point: runs `crc16_width32_pmull` over `data` starting from `crc`, using
/// the CCITT reflected key table and `crc16_ccitt_slice8` as the scalar fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon`; presumably callers select this only after feature detection — confirm.
#[inline]
pub fn crc16_ccitt_pmull_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ccitt_slice8;
    // SAFETY: relies on the caller having established `aes` + `neon` support (see note).
    unsafe { crc16_width32_pmull(crc, data, &CRC16_CCITT_KEYS_REFLECTED, scalar) }
}
/// Safe entry point: runs `crc16_width32_pmull_small` over `data` starting from `crc`,
/// using the CCITT reflected key table and `crc16_ccitt_slice8` as the scalar fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon`; presumably callers select this only after feature detection — confirm.
#[inline]
pub fn crc16_ccitt_pmull_small_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ccitt_slice8;
    // SAFETY: relies on the caller having established `aes` + `neon` support (see note).
    unsafe { crc16_width32_pmull_small(crc, data, &CRC16_CCITT_KEYS_REFLECTED, scalar) }
}
/// Safe entry point: runs the two-stream kernel `crc16_width32_pmull_2way` over `data`
/// starting from `crc`, using the CCITT reflected keys and stream constants, with
/// `crc16_ccitt_slice8` as the scalar fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon`; presumably callers select this only after feature detection — confirm.
#[inline]
pub fn crc16_ccitt_pmull_2way_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ccitt_slice8;
    let fold_256b = CRC16_CCITT_STREAM_REFLECTED.fold_256b;
    // SAFETY: relies on the caller having established `aes` + `neon` support (see note).
    unsafe { crc16_width32_pmull_2way(crc, data, &CRC16_CCITT_KEYS_REFLECTED, fold_256b, scalar) }
}
/// Safe entry point: runs the three-stream kernel `crc16_width32_pmull_3way` over `data`
/// starting from `crc`, using the CCITT reflected keys and stream constants, with
/// `crc16_ccitt_slice8` as the scalar fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon`; presumably callers select this only after feature detection — confirm.
#[inline]
pub fn crc16_ccitt_pmull_3way_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ccitt_slice8;
    let fold_384b = CRC16_CCITT_STREAM_REFLECTED.fold_384b;
    let fold_256b = CRC16_CCITT_STREAM_REFLECTED.fold_256b;
    // SAFETY: relies on the caller having established `aes` + `neon` support (see note).
    unsafe { crc16_width32_pmull_3way(crc, data, &CRC16_CCITT_KEYS_REFLECTED, fold_384b, fold_256b, scalar) }
}
/// Safe entry point: runs the EOR3 kernel `crc16_width32_pmull_eor3` over `data` starting
/// from `crc`, using the CCITT reflected key table and `crc16_ccitt_slice8` as the scalar
/// fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon` + `sha3`; presumably callers select this only after feature detection —
/// confirm.
#[inline]
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
pub fn crc16_ccitt_pmull_eor3_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ccitt_slice8;
    // SAFETY: relies on the caller having established `aes`/`neon`/`sha3` support (see note).
    unsafe { crc16_width32_pmull_eor3(crc, data, &CRC16_CCITT_KEYS_REFLECTED, scalar) }
}
/// Safe entry point: runs the two-stream EOR3 kernel `crc16_width32_pmull_eor3_2way` over
/// `data` starting from `crc`, using the CCITT reflected keys and stream constants, with
/// `crc16_ccitt_slice8` as the scalar fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon` + `sha3`; presumably callers select this only after feature detection —
/// confirm.
#[inline]
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
pub fn crc16_ccitt_pmull_eor3_2way_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ccitt_slice8;
    let fold_256b = CRC16_CCITT_STREAM_REFLECTED.fold_256b;
    // SAFETY: relies on the caller having established `aes`/`neon`/`sha3` support (see note).
    unsafe { crc16_width32_pmull_eor3_2way(crc, data, &CRC16_CCITT_KEYS_REFLECTED, fold_256b, scalar) }
}
/// Safe entry point: runs the three-stream EOR3 kernel `crc16_width32_pmull_eor3_3way`
/// over `data` starting from `crc`, using the CCITT reflected keys and stream constants,
/// with `crc16_ccitt_slice8` as the scalar fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon` + `sha3`; presumably callers select this only after feature detection —
/// confirm.
#[inline]
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
pub fn crc16_ccitt_pmull_eor3_3way_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ccitt_slice8;
    let fold_384b = CRC16_CCITT_STREAM_REFLECTED.fold_384b;
    let fold_256b = CRC16_CCITT_STREAM_REFLECTED.fold_256b;
    // SAFETY: relies on the caller having established `aes`/`neon`/`sha3` support (see note).
    unsafe {
        crc16_width32_pmull_eor3_3way(crc, data, &CRC16_CCITT_KEYS_REFLECTED, fold_384b, fold_256b, scalar)
    }
}
/// Safe entry point: runs `crc16_width32_pmull` over `data` starting from `crc`, using
/// the IBM reflected key table and `crc16_ibm_slice8` as the scalar fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon`; presumably callers select this only after feature detection — confirm.
#[inline]
pub fn crc16_ibm_pmull_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ibm_slice8;
    // SAFETY: relies on the caller having established `aes` + `neon` support (see note).
    unsafe { crc16_width32_pmull(crc, data, &CRC16_IBM_KEYS_REFLECTED, scalar) }
}
/// Safe entry point: runs `crc16_width32_pmull_small` over `data` starting from `crc`,
/// using the IBM reflected key table and `crc16_ibm_slice8` as the scalar fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon`; presumably callers select this only after feature detection — confirm.
#[inline]
pub fn crc16_ibm_pmull_small_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ibm_slice8;
    // SAFETY: relies on the caller having established `aes` + `neon` support (see note).
    unsafe { crc16_width32_pmull_small(crc, data, &CRC16_IBM_KEYS_REFLECTED, scalar) }
}
/// Safe entry point: runs the two-stream kernel `crc16_width32_pmull_2way` over `data`
/// starting from `crc`, using the IBM reflected keys and stream constants, with
/// `crc16_ibm_slice8` as the scalar fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon`; presumably callers select this only after feature detection — confirm.
#[inline]
pub fn crc16_ibm_pmull_2way_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ibm_slice8;
    let fold_256b = CRC16_IBM_STREAM_REFLECTED.fold_256b;
    // SAFETY: relies on the caller having established `aes` + `neon` support (see note).
    unsafe { crc16_width32_pmull_2way(crc, data, &CRC16_IBM_KEYS_REFLECTED, fold_256b, scalar) }
}
/// Safe entry point: runs the three-stream kernel `crc16_width32_pmull_3way` over `data`
/// starting from `crc`, using the IBM reflected keys and stream constants, with
/// `crc16_ibm_slice8` as the scalar fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon`; presumably callers select this only after feature detection — confirm.
#[inline]
pub fn crc16_ibm_pmull_3way_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ibm_slice8;
    let fold_384b = CRC16_IBM_STREAM_REFLECTED.fold_384b;
    let fold_256b = CRC16_IBM_STREAM_REFLECTED.fold_256b;
    // SAFETY: relies on the caller having established `aes` + `neon` support (see note).
    unsafe { crc16_width32_pmull_3way(crc, data, &CRC16_IBM_KEYS_REFLECTED, fold_384b, fold_256b, scalar) }
}
/// Safe entry point: runs the EOR3 kernel `crc16_width32_pmull_eor3` over `data` starting
/// from `crc`, using the IBM reflected key table and `crc16_ibm_slice8` as the scalar
/// fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon` + `sha3`; presumably callers select this only after feature detection —
/// confirm.
#[inline]
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
pub fn crc16_ibm_pmull_eor3_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ibm_slice8;
    // SAFETY: relies on the caller having established `aes`/`neon`/`sha3` support (see note).
    unsafe { crc16_width32_pmull_eor3(crc, data, &CRC16_IBM_KEYS_REFLECTED, scalar) }
}
/// Safe entry point: runs the two-stream EOR3 kernel `crc16_width32_pmull_eor3_2way` over
/// `data` starting from `crc`, using the IBM reflected keys and stream constants, with
/// `crc16_ibm_slice8` as the scalar fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon` + `sha3`; presumably callers select this only after feature detection —
/// confirm.
#[inline]
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
pub fn crc16_ibm_pmull_eor3_2way_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ibm_slice8;
    let fold_256b = CRC16_IBM_STREAM_REFLECTED.fold_256b;
    // SAFETY: relies on the caller having established `aes`/`neon`/`sha3` support (see note).
    unsafe { crc16_width32_pmull_eor3_2way(crc, data, &CRC16_IBM_KEYS_REFLECTED, fold_256b, scalar) }
}
/// Safe entry point: runs the three-stream EOR3 kernel `crc16_width32_pmull_eor3_3way`
/// over `data` starting from `crc`, using the IBM reflected keys and stream constants,
/// with `crc16_ibm_slice8` as the scalar fallback.
///
/// NOTE(review): no CPU-feature check happens here although the kernel is compiled with
/// `aes` + `neon` + `sha3`; presumably callers select this only after feature detection —
/// confirm.
#[inline]
#[cfg(all(not(miri), any(target_os = "linux", target_os = "android")))]
pub fn crc16_ibm_pmull_eor3_3way_safe(crc: u16, data: &[u8]) -> u16 {
    let scalar: fn(u16, &[u8]) -> u16 = super::portable::crc16_ibm_slice8;
    let fold_384b = CRC16_IBM_STREAM_REFLECTED.fold_384b;
    let fold_256b = CRC16_IBM_STREAM_REFLECTED.fold_256b;
    // SAFETY: relies on the caller having established `aes`/`neon`/`sha3` support (see note).
    unsafe {
        crc16_width32_pmull_eor3_3way(crc, data, &CRC16_IBM_KEYS_REFLECTED, fold_384b, fold_256b, scalar)
    }
}