#![cfg(target_arch = "aarch64")]
use crate::crc32::fusion::aarch64::{clmul_hi, clmul_lo, clmul_scalar};
use core::arch::aarch64::{
__crc32cb, __crc32cd, __crc32cw, uint64x2_t, veor3q_u64, veorq_u64, vgetq_lane_u64, vld1q_u64,
vmov_n_u64, vmull_p8, vreinterpret_p8_u64, vreinterpretq_u64_p16,
};
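/// CRC-32C ("iSCSI", polynomial 0x1EDC6F41, reflected) over `buf[..len]`,
/// continuing from the raw state `crc0`. No bit inversion is applied here;
/// the caller is expected to handle the conventional pre/post `!`.
///
/// Large inputs are consumed 192 bytes per iteration: three interleaved
/// scalar CRC streams run alongside nine 128-bit carryless-multiply folding
/// accumulators, with the three-way XORs fused into single EOR3 instructions.
///
/// # Safety
///
/// `buf` must be valid for reads of `len` bytes, and the CPU must support the
/// `crc`, `aes` (PMULL), and `sha3` (EOR3) features.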
#[inline]
#[target_feature(enable = "crc,aes,sha3")]
pub(crate) unsafe fn crc32_iscsi_eor3_v9s3x2e_s3(
mut crc0: u32,
mut buf: *const u8,
mut len: usize,
) -> u32 {
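    // Process single bytes until buf is 8-byte aligned.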
while len > 0 && (buf as usize & 7) != 0 {
crc0 = __crc32cb(crc0, *buf);
buf = buf.add(1);
len -= 1;
}
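    // If buf is 8- but not 16-byte aligned, consume one word so the vector
    // loads below start on a 16-byte boundary.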
if (buf as usize & 8) != 0 && len >= 8 {
crc0 = __crc32cd(crc0, *(buf as *const u64));
buf = buf.add(8);
len -= 8;
}
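    // Wide path: each main-loop iteration consumes 192 bytes, 48 across three
    // scalar CRC streams and 144 across nine PMULL folding accumulators.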
if len >= 192 {
let end = buf.add(len);
let blk = len / 192;
let klen = blk * 16;
let buf2 = buf.add(klen * 3);
let limit = buf.add(klen).sub(32);
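        // The scalar streams cover buf .. buf + 3 * klen as three interleaved
        // klen-byte slices; the accumulators cover the 144 * blk bytes at buf2.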
let mut crc1 = 0u32;
let mut crc2 = 0u32;
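        // Seed the nine 128-bit accumulators from the first 144 bytes of the
        // vector region.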
let mut x0 = vld1q_u64(buf2 as *const u64);
let mut x1 = vld1q_u64(buf2.add(16) as *const u64);
let mut x2 = vld1q_u64(buf2.add(32) as *const u64);
let mut x3 = vld1q_u64(buf2.add(48) as *const u64);
let mut x4 = vld1q_u64(buf2.add(64) as *const u64);
let mut x5 = vld1q_u64(buf2.add(80) as *const u64);
let mut x6 = vld1q_u64(buf2.add(96) as *const u64);
let mut x7 = vld1q_u64(buf2.add(112) as *const u64);
let mut x8 = vld1q_u64(buf2.add(128) as *const u64);
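        // Folding constants that carry a 128-bit accumulator forward by 144
        // bytes (one full round of nine vectors).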
let k_vals: [u64; 2] = [0x7e908048, 0xc96cfdc0];
let mut k = vld1q_u64(k_vals.as_ptr());
let mut buf2 = buf2.add(144);
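        // Fold the accumulators over the next 144 vector bytes and feed two
        // words to each scalar stream per iteration.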
while buf <= limit {
let y0 = clmul_lo(x0, k);
x0 = clmul_hi(x0, k);
let y1 = clmul_lo(x1, k);
x1 = clmul_hi(x1, k);
let y2 = clmul_lo(x2, k);
x2 = clmul_hi(x2, k);
let y3 = clmul_lo(x3, k);
x3 = clmul_hi(x3, k);
let y4 = clmul_lo(x4, k);
x4 = clmul_hi(x4, k);
let y5 = clmul_lo(x5, k);
x5 = clmul_hi(x5, k);
let y6 = clmul_lo(x6, k);
x6 = clmul_hi(x6, k);
let y7 = clmul_lo(x7, k);
x7 = clmul_hi(x7, k);
let y8 = clmul_lo(x8, k);
x8 = clmul_hi(x8, k);
x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64));
x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64));
x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64));
x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64));
x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64));
x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64));
x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64));
x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64));
x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64));
crc0 = __crc32cd(crc0, *(buf as *const u64));
crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64));
crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64));
crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64));
crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64));
crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64));
buf = buf.add(16);
buf2 = buf2.add(144);
}
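        // Reduce the accumulators: an extra fold-by-16 collapses x0 into x1
        // (9 -> 8), then pairwise fold-by-16 brings 8 down to 4.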
let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27];
k = vld1q_u64(k_vals.as_ptr());
let y0 = clmul_lo(x0, k);
x0 = clmul_hi(x0, k);
x0 = veor3q_u64(x0, y0, x1);
x1 = x2;
x2 = x3;
x3 = x4;
x4 = x5;
x5 = x6;
x6 = x7;
x7 = x8;
let y0 = clmul_lo(x0, k);
x0 = clmul_hi(x0, k);
let y2 = clmul_lo(x2, k);
x2 = clmul_hi(x2, k);
let y4 = clmul_lo(x4, k);
x4 = clmul_hi(x4, k);
let y6 = clmul_lo(x6, k);
x6 = clmul_hi(x6, k);
x0 = veor3q_u64(x0, y0, x1);
x2 = veor3q_u64(x2, y2, x3);
x4 = veor3q_u64(x4, y4, x5);
x6 = veor3q_u64(x6, y6, x7);
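        // Fold-by-32: 4 accumulators -> 2.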
let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e];
k = vld1q_u64(k_vals.as_ptr());
let y0 = clmul_lo(x0, k);
x0 = clmul_hi(x0, k);
let y4 = clmul_lo(x4, k);
x4 = clmul_hi(x4, k);
x0 = veor3q_u64(x0, y0, x2);
x4 = veor3q_u64(x4, y4, x6);
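        // Fold-by-64: 2 accumulators -> 1.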
let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8];
k = vld1q_u64(k_vals.as_ptr());
let y0 = clmul_lo(x0, k);
x0 = clmul_hi(x0, k);
x0 = veor3q_u64(x0, y0, x4);
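        // Finish the scalar streams, shift each CRC past the bytes processed
        // after its slice, and merge everything into crc0.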
crc0 = __crc32cd(crc0, *(buf as *const u64));
crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64));
crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64));
crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64));
crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64));
crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64));
let vc0 = crc_shift_iscsi(crc0, klen * 2 + blk * 144);
let vc1 = crc_shift_iscsi(crc1, klen + blk * 144);
let vc2 = crc_shift_iscsi(crc2, blk * 144);
let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0);
crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1));
buf = buf2;
len = end.offset_from(buf) as usize;
}
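    // Medium path (32..192 remaining bytes): three interleaved scalar streams
    // over klen-byte slices, merged below with two carryless shifts.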
if len >= 32 {
let klen = ((len - 8) / 24) * 8;
let mut crc1 = 0u32;
let mut crc2 = 0u32;
loop {
crc0 = __crc32cd(crc0, *(buf as *const u64));
crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64));
crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64));
buf = buf.add(8);
len -= 24;
if len < 32 {
break;
}
}
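        // Shift streams 0 and 1 past the data processed after their slices,
        // then fold both into stream 2 through the next data word.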
let vc0 = crc_shift_iscsi(crc0, klen * 2 + 8);
let vc1 = crc_shift_iscsi(crc1, klen + 8);
let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0);
buf = buf.add(klen * 2);
crc0 = crc2;
crc0 = __crc32cd(crc0, *(buf as *const u64) ^ vc);
buf = buf.add(8);
len -= 8;
}
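    // Tail: one word at a time, then one byte at a time.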
while len >= 8 {
crc0 = __crc32cd(crc0, *(buf as *const u64));
buf = buf.add(8);
len -= 8;
}
while len > 0 {
crc0 = __crc32cb(crc0, *buf);
buf = buf.add(1);
len -= 1;
}
crc0
}
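/// Advances `crc` past `nbytes` zero bytes, i.e. multiplies the CRC state by
/// x^(8 * nbytes) mod P with a single carryless multiply, so independently
/// computed stream CRCs can be merged by XOR. The `- 33` offset compensates
/// for the reflected bit order and for the 64-to-32-bit crc32cd reduction the
/// caller applies to the product.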
#[inline]
#[target_feature(enable = "aes")]
unsafe fn crc_shift_iscsi(crc: u32, nbytes: usize) -> uint64x2_t {
clmul_scalar(crc, xnmodp_crc32_iscsi((nbytes * 8 - 33) as u64))
}
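/// Computes x^n mod P for the CRC-32C polynomial, in the reflected bit order
/// used by the AArch64 CRC instructions, via binary square-and-multiply.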
#[inline]
#[target_feature(enable = "crc,aes")]
unsafe fn xnmodp_crc32_iscsi(mut n: u64) -> u32 {
    // Halve n down to a small exponent, recording the low bit of n at each
    // step; starting from `!1`, the inversion below turns the all-ones prefix
    // into a terminating sentinel for the replay loop.
    let mut stack = !1u64;
while n > 191 {
stack = (stack << 1) + (n & 1);
n = (n >> 1) - 16;
}
stack = !stack;
    // Evaluate directly for the now-small n: seed with x^(n % 32) in the
    // reflected bit order, then multiply by x^32 (one crc32cw over zero data)
    // for each remaining 32-bit step.
    let mut acc = 0x80000000u32 >> (n & 31);
n >>= 5;
while n > 0 {
acc = __crc32cw(acc, 0);
n -= 1;
}
    // Replay the recorded bits (last pushed first): square acc, fold the bit
    // into the exponent via a one-bit shift, and reduce the 64-bit product
    // back to 32 bits with crc32cd. Squaring over GF(2) has no cross terms,
    // so the lane-wise vmull_p8 of a value with itself is the square of the
    // whole 32-bit polynomial.
    loop {
        let low = (stack & 1) as u32;
        stack >>= 1;
        if stack == 0 {
            break;
        }
        let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64));
        let squared = vmull_p8(x, x);
        let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0);
        acc = __crc32cd(0, y << low);
    }
acc
}
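// A minimal sanity check, as a sketch: it assumes tests build with `std`
// available for runtime feature detection, and uses the standard CRC-32C
// check value (the CRC of "123456789" with the usual ~0 pre/post XOR is
// 0xE3069283).
#[cfg(test)]
mod tests {
    use super::crc32_iscsi_eor3_v9s3x2e_s3;

    #[test]
    fn matches_crc32c_check_value() {
        // Skip silently on hardware without the required extensions.
        if !(std::arch::is_aarch64_feature_detected!("crc")
            && std::arch::is_aarch64_feature_detected!("aes")
            && std::arch::is_aarch64_feature_detected!("sha3"))
        {
            return;
        }
        let data = b"123456789";
        // The kernel works on raw CRC state, so apply the conventional
        // pre-inversion of the seed and post-inversion of the result here.
        let crc = unsafe { crc32_iscsi_eor3_v9s3x2e_s3(!0, data.as_ptr(), data.len()) };
        assert_eq!(!crc, 0xE3069283);
    }
}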