#![allow(unsafe_code)]
#![allow(dead_code)]
#![allow(clippy::indexing_slicing)]
use core::{
arch::x86_64::*,
ops::{BitXor, BitXorAssign},
};
use crate::checksum::common::clmul::{CRC64_NVME_STREAM, CRC64_XZ_STREAM, Crc64ClmulConstants};
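// CRC-64 folding kernels for x86_64 (XZ and NVME polynomials).
//
// The SSE2/PCLMULQDQ paths fold the input in 128-byte blocks held as eight
// 128-bit lanes; the AVX-512/VPCLMULQDQ paths hold the same block in two
// 512-bit registers. The multi-way variants run several independent fold
// streams so the carry-less multiplies can pipeline, then merge the streams
// with per-distance fold constants before the final Barrett reduction.
//
// `Simd` is a thin wrapper over `__m128i` that provides XOR operators and the
// fold/reduce primitives used by every kernel below.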
#[repr(transparent)]
#[derive(Copy, Clone, Debug)]
struct Simd(__m128i);
impl BitXor for Simd {
type Output = Self;
#[inline]
fn bitxor(self, other: Self) -> Self {
unsafe { Self(_mm_xor_si128(self.0, other.0)) }
}
}
impl BitXorAssign for Simd {
#[inline]
fn bitxor_assign(&mut self, other: Self) {
*self = *self ^ other;
}
}
impl Simd {
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn new(high: u64, low: u64) -> Self {
Self(_mm_set_epi64x(high.cast_signed(), low.cast_signed()))
}
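// Folds this lane forward by the distance encoded in `coeff`: each 64-bit
// half is carry-less multiplied by the matching half of `coeff` and the two
// 128-bit products are XORed together.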
#[inline]
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn fold_16(self, coeff: Self) -> Self {
let h = _mm_clmulepi64_si128::<0x11>(self.0, coeff.0);
let l = _mm_clmulepi64_si128::<0x00>(self.0, coeff.0);
Self(_mm_xor_si128(h, l))
}
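// Pre-reduction step: carry-less multiplies the low qword by the single
// `fold_8b` constant and XORs in the high qword, shrinking the remainder
// ahead of the Barrett reduction.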
#[inline]
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn fold_8(self, coeff: u64) -> Self {
let coeff = unsafe { Self::new(0, coeff) };
let h = _mm_clmulepi64_si128::<0x00>(self.0, coeff.0);
let l = _mm_srli_si128::<8>(self.0);
Self(_mm_xor_si128(h, l))
}
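// Barrett reduction: derives the final 64-bit CRC from the 128-bit remainder
// using the precomputed quotient constant `mu` and the polynomial `poly`.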
#[inline]
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn barrett(self, poly: u64, mu: u64) -> u64 {
unsafe {
let polymu = Self::new(poly, mu);
let t1 = _mm_clmulepi64_si128::<0x00>(self.0, polymu.0);
let h = _mm_slli_si128::<8>(t1);
let l = _mm_clmulepi64_si128::<0x10>(t1, polymu.0);
let reduced = Self(_mm_xor_si128(_mm_xor_si128(h, l), self.0));
let hi = _mm_srli_si128::<8>(reduced.0);
_mm_cvtsi128_si64(hi).cast_unsigned()
}
}
}
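// Single-stream driver over 128-byte blocks: XOR the incoming CRC into the
// first lane of the first block, fold each following block into the eight
// running lanes, then collapse the lanes via `fold_tail`.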
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn update_simd(state: u64, first: &[Simd; 8], rest: &[[Simd; 8]], consts: &Crc64ClmulConstants) -> u64 {
unsafe {
let mut x = *first;
x[0] ^= Simd::new(0, state);
let coeff = Simd::new(consts.fold_128b.0, consts.fold_128b.1);
for chunk in rest {
fold_block_128(&mut x, chunk, coeff);
}
fold_tail(x, consts)
}
}
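// Collapses the eight 16-byte lanes into one: lane i is folded forward by its
// byte distance from lane 7 (tail_fold_16b[0..=6] cover 112 down to 16 bytes)
// and XORed into lane 7, which is then reduced to the 64-bit CRC.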
#[inline(always)]
unsafe fn fold_tail(x: [Simd; 8], consts: &Crc64ClmulConstants) -> u64 {
unsafe {
let c0 = Simd::new(consts.tail_fold_16b[0].0, consts.tail_fold_16b[0].1);
let c1 = Simd::new(consts.tail_fold_16b[1].0, consts.tail_fold_16b[1].1);
let c2 = Simd::new(consts.tail_fold_16b[2].0, consts.tail_fold_16b[2].1);
let c3 = Simd::new(consts.tail_fold_16b[3].0, consts.tail_fold_16b[3].1);
let c4 = Simd::new(consts.tail_fold_16b[4].0, consts.tail_fold_16b[4].1);
let c5 = Simd::new(consts.tail_fold_16b[5].0, consts.tail_fold_16b[5].1);
let c6 = Simd::new(consts.tail_fold_16b[6].0, consts.tail_fold_16b[6].1);
let mut acc = x[7];
acc ^= x[0].fold_16(c0);
acc ^= x[1].fold_16(c1);
acc ^= x[2].fold_16(c2);
acc ^= x[3].fold_16(c3);
acc ^= x[4].fold_16(c4);
acc ^= x[5].fold_16(c5);
acc ^= x[6].fold_16(c6);
acc.fold_8(consts.fold_8b).barrett(consts.poly, consts.mu)
}
}
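// Folds one 128-byte block into the running lanes: each lane is advanced by
// the block distance encoded in `coeff` and XORed with the matching lane of
// `chunk`.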
#[inline]
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn fold_block_128(x: &mut [Simd; 8], chunk: &[Simd; 8], coeff: Simd) {
unsafe {
let t0 = x[0].fold_16(coeff);
let t1 = x[1].fold_16(coeff);
let t2 = x[2].fold_16(coeff);
let t3 = x[3].fold_16(coeff);
let t4 = x[4].fold_16(coeff);
let t5 = x[5].fold_16(coeff);
let t6 = x[6].fold_16(coeff);
let t7 = x[7].fold_16(coeff);
x[0] = chunk[0] ^ t0;
x[1] = chunk[1] ^ t1;
x[2] = chunk[2] ^ t2;
x[3] = chunk[3] ^ t3;
x[4] = chunk[4] ^ t4;
x[5] = chunk[5] ^ t5;
x[6] = chunk[6] ^ t6;
x[7] = chunk[7] ^ t7;
}
}
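// Merges one fold stream into an accumulator stream: each lane of `stream` is
// folded forward by the inter-stream distance `coeff` and XORed in.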
#[inline(always)]
unsafe fn merge_lanes_xor(acc: &mut [Simd; 8], stream: &[Simd; 8], coeff: Simd) {
unsafe {
acc[0] ^= stream[0].fold_16(coeff);
acc[1] ^= stream[1].fold_16(coeff);
acc[2] ^= stream[2].fold_16(coeff);
acc[3] ^= stream[3].fold_16(coeff);
acc[4] ^= stream[4].fold_16(coeff);
acc[5] ^= stream[5].fold_16(coeff);
acc[6] ^= stream[6].fold_16(coeff);
acc[7] ^= stream[7].fold_16(coeff);
}
}
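// Two interleaved fold streams (even/odd blocks), each advancing 256 bytes
// per fold so the multiplies of the two streams overlap. `double_even` bounds
// a 2x-unrolled inner loop (four blocks per iteration); the remainder is
// handled by the plain two-block loop, a 128-byte stream merge, and one final
// single-block fold when the block count is odd.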
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn update_simd_2way(
state: u64,
blocks: &[[Simd; 8]],
fold_256b: (u64, u64),
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
debug_assert!(blocks.len() >= 2);
let coeff_256 = Simd::new(fold_256b.0, fold_256b.1);
let coeff_128 = Simd::new(consts.fold_128b.0, consts.fold_128b.1);
let mut s0 = blocks[0];
let mut s1 = blocks[1];
s0[0] ^= Simd::new(0, state);
let mut i = 2;
let even = blocks.len() & !1usize;
let double_even = 2usize.strict_add(((blocks.len().strict_sub(2)) / 4).strict_mul(4));
while i < double_even {
let prefetch_ptr = blocks.as_ptr().add(i).cast::<u8>();
prefetch_read_l1(prefetch_ptr.wrapping_add(LARGE_BLOCK_DISTANCE));
fold_block_128(&mut s0, &blocks[i], coeff_256);
fold_block_128(&mut s1, &blocks[i + 1], coeff_256);
fold_block_128(&mut s0, &blocks[i + 2], coeff_256);
fold_block_128(&mut s1, &blocks[i + 3], coeff_256);
i = i.strict_add(4);
}
while i < even {
fold_block_128(&mut s0, &blocks[i], coeff_256);
fold_block_128(&mut s1, &blocks[i + 1], coeff_256);
i = i.strict_add(2);
}
let mut combined = s1;
merge_lanes_xor(&mut combined, &s0, coeff_128);
if even != blocks.len() {
fold_block_128(&mut combined, &blocks[even], coeff_128);
}
fold_tail(combined, consts)
}
}
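// Four interleaved fold streams advancing 512 bytes per fold, with an
// 8-blocks-per-iteration unrolled inner loop. Inputs shorter than four blocks
// fall back to the single-stream driver; `combine` holds the 384/256/128-byte
// constants used to merge the four streams.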
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn update_simd_4way(
state: u64,
blocks: &[[Simd; 8]],
fold_512b: (u64, u64),
combine: &[(u64, u64); 3],
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
debug_assert!(!blocks.is_empty());
if blocks.len() < 4 {
let Some((first, rest)) = blocks.split_first() else {
return state;
};
return update_simd(state, first, rest, consts);
}
let aligned = (blocks.len() / 4) * 4;
let double_aligned = 4usize.strict_add(((blocks.len().strict_sub(4)) / 8).strict_mul(8));
let coeff_512 = Simd::new(fold_512b.0, fold_512b.1);
let coeff_128 = Simd::new(consts.fold_128b.0, consts.fold_128b.1);
let c384 = Simd::new(combine[0].0, combine[0].1);
let c256 = Simd::new(combine[1].0, combine[1].1);
let c128 = Simd::new(combine[2].0, combine[2].1);
let mut s0 = blocks[0];
let mut s1 = blocks[1];
let mut s2 = blocks[2];
let mut s3 = blocks[3];
s0[0] ^= Simd::new(0, state);
let mut i = 4;
while i < double_aligned {
let prefetch_ptr = blocks.as_ptr().add(i).cast::<u8>();
prefetch_read_l1(prefetch_ptr.wrapping_add(LARGE_BLOCK_DISTANCE));
fold_block_128(&mut s0, &blocks[i], coeff_512);
fold_block_128(&mut s1, &blocks[i + 1], coeff_512);
fold_block_128(&mut s2, &blocks[i + 2], coeff_512);
fold_block_128(&mut s3, &blocks[i + 3], coeff_512);
fold_block_128(&mut s0, &blocks[i + 4], coeff_512);
fold_block_128(&mut s1, &blocks[i + 5], coeff_512);
fold_block_128(&mut s2, &blocks[i + 6], coeff_512);
fold_block_128(&mut s3, &blocks[i + 7], coeff_512);
i = i.strict_add(8);
}
while i < aligned {
fold_block_128(&mut s0, &blocks[i], coeff_512);
fold_block_128(&mut s1, &blocks[i + 1], coeff_512);
fold_block_128(&mut s2, &blocks[i + 2], coeff_512);
fold_block_128(&mut s3, &blocks[i + 3], coeff_512);
i = i.strict_add(4);
}
let mut combined = s3;
merge_lanes_xor(&mut combined, &s2, c128);
merge_lanes_xor(&mut combined, &s1, c256);
merge_lanes_xor(&mut combined, &s0, c384);
for block in &blocks[aligned..] {
fold_block_128(&mut combined, block, coeff_128);
}
fold_tail(combined, consts)
}
}
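// Seven interleaved fold streams advancing 896 bytes per fold. The streams
// are merged as a tree: adjacent pairs at 128 bytes, then the pairs at
// 256/512/768 bytes, which keeps the dependency chain of the merge short.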
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn update_simd_7way(
state: u64,
blocks: &[[Simd; 8]],
fold_896b: (u64, u64),
combine: &[(u64, u64); 6],
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
debug_assert!(!blocks.is_empty());
if blocks.len() < 7 {
let Some((first, rest)) = blocks.split_first() else {
return state;
};
return update_simd(state, first, rest, consts);
}
let aligned = (blocks.len() / 7) * 7;
let coeff_896 = Simd::new(fold_896b.0, fold_896b.1);
let coeff_128 = Simd::new(consts.fold_128b.0, consts.fold_128b.1);
let c768 = Simd::new(combine[0].0, combine[0].1);
let c512 = Simd::new(combine[2].0, combine[2].1);
let c256 = Simd::new(combine[4].0, combine[4].1);
let c128 = Simd::new(combine[5].0, combine[5].1);
let mut s0 = blocks[0];
let mut s1 = blocks[1];
let mut s2 = blocks[2];
let mut s3 = blocks[3];
let mut s4 = blocks[4];
let mut s5 = blocks[5];
let mut s6 = blocks[6];
s0[0] ^= Simd::new(0, state);
let mut i = 7;
while i < aligned {
let prefetch_ptr = blocks.as_ptr().add(i).cast::<u8>();
prefetch_read_l1(prefetch_ptr.wrapping_add(LARGE_BLOCK_DISTANCE));
fold_block_128(&mut s0, &blocks[i], coeff_896);
fold_block_128(&mut s1, &blocks[i + 1], coeff_896);
fold_block_128(&mut s2, &blocks[i + 2], coeff_896);
fold_block_128(&mut s3, &blocks[i + 3], coeff_896);
fold_block_128(&mut s4, &blocks[i + 4], coeff_896);
fold_block_128(&mut s5, &blocks[i + 5], coeff_896);
fold_block_128(&mut s6, &blocks[i + 6], coeff_896);
i = i.strict_add(7);
}
let mut combined = s6;
merge_lanes_xor(&mut combined, &s5, c128);
merge_lanes_xor(&mut s4, &s3, c128);
merge_lanes_xor(&mut combined, &s4, c256);
merge_lanes_xor(&mut s2, &s1, c128);
merge_lanes_xor(&mut combined, &s2, c512);
merge_lanes_xor(&mut combined, &s0, c768);
for block in &blocks[aligned..] {
fold_block_128(&mut combined, block, coeff_128);
}
fold_tail(combined, consts)
}
}
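// Eight interleaved fold streams advancing 1024 bytes per fold, merged as a
// balanced tree of 128-byte pair folds followed by 256/512/768-byte combines.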
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn update_simd_8way(
state: u64,
blocks: &[[Simd; 8]],
fold_1024b: (u64, u64),
combine: &[(u64, u64); 7],
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
debug_assert!(!blocks.is_empty());
if blocks.len() < 8 {
let Some((first, rest)) = blocks.split_first() else {
return state;
};
return update_simd(state, first, rest, consts);
}
let aligned = (blocks.len() / 8) * 8;
let coeff_1024 = Simd::new(fold_1024b.0, fold_1024b.1);
let coeff_128 = Simd::new(consts.fold_128b.0, consts.fold_128b.1);
let c768 = Simd::new(combine[1].0, combine[1].1);
let c512 = Simd::new(combine[3].0, combine[3].1);
let c256 = Simd::new(combine[5].0, combine[5].1);
let c128 = Simd::new(combine[6].0, combine[6].1);
let mut s0 = blocks[0];
let mut s1 = blocks[1];
let mut s2 = blocks[2];
let mut s3 = blocks[3];
let mut s4 = blocks[4];
let mut s5 = blocks[5];
let mut s6 = blocks[6];
let mut s7 = blocks[7];
s0[0] ^= Simd::new(0, state);
let mut i = 8;
while i < aligned {
let prefetch_ptr = blocks.as_ptr().add(i).cast::<u8>();
prefetch_read_l1(prefetch_ptr.wrapping_add(LARGE_BLOCK_DISTANCE));
fold_block_128(&mut s0, &blocks[i], coeff_1024);
fold_block_128(&mut s1, &blocks[i + 1], coeff_1024);
fold_block_128(&mut s2, &blocks[i + 2], coeff_1024);
fold_block_128(&mut s3, &blocks[i + 3], coeff_1024);
fold_block_128(&mut s4, &blocks[i + 4], coeff_1024);
fold_block_128(&mut s5, &blocks[i + 5], coeff_1024);
fold_block_128(&mut s6, &blocks[i + 6], coeff_1024);
fold_block_128(&mut s7, &blocks[i + 7], coeff_1024);
i = i.strict_add(8);
}
let mut combined = s7;
merge_lanes_xor(&mut combined, &s6, c128);
merge_lanes_xor(&mut s5, &s4, c128);
merge_lanes_xor(&mut combined, &s5, c256);
merge_lanes_xor(&mut s3, &s2, c128);
merge_lanes_xor(&mut combined, &s3, c512);
merge_lanes_xor(&mut s1, &s0, c128);
merge_lanes_xor(&mut combined, &s1, c768);
for block in &blocks[aligned..] {
fold_block_128(&mut combined, block, coeff_128);
}
fold_tail(combined, consts)
}
}
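// Kernel entry point: `align_to` splits the input into an unaligned head, a
// run of 16-byte-aligned 128-byte blocks, and a tail; head and tail go
// through the portable slice-by-8 kernel, the blocks through the SIMD fold.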
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn crc64_pclmul(mut state: u64, bytes: &[u8], consts: &Crc64ClmulConstants, tables: &[[u64; 256]; 8]) -> u64 {
unsafe {
let (left, middle, right) = bytes.align_to::<[Simd; 8]>();
if let Some((first, rest)) = middle.split_first() {
state = super::portable::crc64_slice8(state, left, tables);
state = update_simd(state, first, rest, consts);
super::portable::crc64_slice8(state, right, tables)
} else {
super::portable::crc64_slice8(state, bytes, tables)
}
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn crc64_pclmul_2way(
mut state: u64,
bytes: &[u8],
fold_256b: (u64, u64),
consts: &Crc64ClmulConstants,
tables: &[[u64; 256]; 8],
) -> u64 {
unsafe {
let (left, middle, right) = bytes.align_to::<[Simd; 8]>();
if middle.is_empty() {
return super::portable::crc64_slice8(state, bytes, tables);
}
state = super::portable::crc64_slice8(state, left, tables);
if middle.len() >= 2 {
state = update_simd_2way(state, middle, fold_256b, consts);
} else if let Some((first, rest)) = middle.split_first() {
state = update_simd(state, first, rest, consts);
}
super::portable::crc64_slice8(state, right, tables)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn crc64_pclmul_4way(
mut state: u64,
bytes: &[u8],
fold_512b: (u64, u64),
combine: &[(u64, u64); 3],
consts: &Crc64ClmulConstants,
tables: &[[u64; 256]; 8],
) -> u64 {
unsafe {
let (left, middle, right) = bytes.align_to::<[Simd; 8]>();
if middle.is_empty() {
return super::portable::crc64_slice8(state, bytes, tables);
}
state = super::portable::crc64_slice8(state, left, tables);
state = update_simd_4way(state, middle, fold_512b, combine, consts);
super::portable::crc64_slice8(state, right, tables)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn crc64_pclmul_7way(
mut state: u64,
bytes: &[u8],
fold_896b: (u64, u64),
combine: &[(u64, u64); 6],
consts: &Crc64ClmulConstants,
tables: &[[u64; 256]; 8],
) -> u64 {
unsafe {
let (left, middle, right) = bytes.align_to::<[Simd; 8]>();
if middle.is_empty() {
return super::portable::crc64_slice8(state, bytes, tables);
}
state = super::portable::crc64_slice8(state, left, tables);
state = update_simd_7way(state, middle, fold_896b, combine, consts);
super::portable::crc64_slice8(state, right, tables)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn crc64_pclmul_8way(
mut state: u64,
bytes: &[u8],
fold_1024b: (u64, u64),
combine: &[(u64, u64); 7],
consts: &Crc64ClmulConstants,
tables: &[[u64; 256]; 8],
) -> u64 {
unsafe {
let (left, middle, right) = bytes.align_to::<[Simd; 8]>();
if middle.is_empty() {
return super::portable::crc64_slice8(state, bytes, tables);
}
state = super::portable::crc64_slice8(state, left, tables);
state = update_simd_8way(state, middle, fold_1024b, combine, consts);
super::portable::crc64_slice8(state, right, tables)
}
}
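// Small-input variant: folds a single stream 16 bytes at a time instead of in
// 128-byte blocks, so inputs shorter than a full block still use CLMUL.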
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
unsafe fn crc64_pclmul_small(
mut state: u64,
bytes: &[u8],
consts: &Crc64ClmulConstants,
tables: &[[u64; 256]; 8],
) -> u64 {
unsafe {
let (left, middle, right) = bytes.align_to::<Simd>();
state = super::portable::crc64_slice8(state, left, tables);
let Some((first, rest)) = middle.split_first() else {
return super::portable::crc64_slice8(state, right, tables);
};
let mut acc = *first;
acc ^= Simd::new(0, state);
let coeff_16b = Simd::new(consts.tail_fold_16b[6].0, consts.tail_fold_16b[6].1);
for chunk in rest {
acc = *chunk ^ acc.fold_16(coeff_16b);
}
state = acc.fold_8(consts.fold_8b).barrett(consts.poly, consts.mu);
super::portable::crc64_slice8(state, right, tables)
}
}
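// AVX-512 analogue of `Simd::fold_16`, folding four 128-bit lanes at once.
// Kept for reference; the hot paths use the ternlog variant below.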
#[inline]
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn fold16_4x(x: __m512i, coeff: __m512i) -> __m512i {
let h = _mm512_clmulepi64_epi128::<0x11>(x, coeff);
let l = _mm512_clmulepi64_epi128::<0x00>(x, coeff);
_mm512_xor_si512(h, l)
}
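// Fold and accumulate in one step: ternlog immediate 0x96 computes the
// three-way XOR `data ^ h ^ l`, saving one XOR per fold.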
#[inline]
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn fold16_4x_ternlog(x: __m512i, data: __m512i, coeff: __m512i) -> __m512i {
let h = _mm512_clmulepi64_epi128::<0x11>(x, coeff);
let l = _mm512_clmulepi64_epi128::<0x00>(x, coeff);
_mm512_ternarylogic_epi64::<0x96>(data, h, l)
}
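// Broadcasts a (high, low) fold-constant pair into all four 128-bit lanes of
// a 512-bit register.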
#[inline]
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn vpclmul_coeff(pair: (u64, u64)) -> __m512i {
_mm512_set_epi64(
pair.0.cast_signed(),
pair.1.cast_signed(),
pair.0.cast_signed(),
pair.1.cast_signed(),
pair.0.cast_signed(),
pair.1.cast_signed(),
pair.0.cast_signed(),
pair.1.cast_signed(),
)
}
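// 512-bit load monomorphized over alignment; the ALIGNED=true path checks the
// 64-byte alignment in debug builds and uses the aligned load instruction.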
#[inline]
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn load_m512<const ALIGNED: bool>(ptr: *const u8) -> __m512i {
unsafe {
if ALIGNED {
debug_assert_eq!((ptr as usize) & 63, 0);
_mm512_load_si512(ptr.cast::<__m512i>())
} else {
_mm512_loadu_si512(ptr.cast::<__m512i>())
}
}
}
#[inline]
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn load_128b_block<const ALIGNED: bool>(block: &[Simd; 8]) -> (__m512i, __m512i) {
unsafe {
let ptr = block as *const [Simd; 8] as *const u8;
let y0 = load_m512::<ALIGNED>(ptr);
let y1 = load_m512::<ALIGNED>(ptr.add(64));
(y0, y1)
}
}
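// Extracts the eight 128-bit lanes from the two 512-bit accumulators and
// reuses the SSE tail fold for the final reduction.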
#[inline]
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn finalize_vpclmul_state(x0: __m512i, x1: __m512i, consts: &Crc64ClmulConstants) -> u64 {
unsafe {
let lanes: [Simd; 8] = [
Simd(_mm512_extracti32x4_epi32::<0>(x0)),
Simd(_mm512_extracti32x4_epi32::<1>(x0)),
Simd(_mm512_extracti32x4_epi32::<2>(x0)),
Simd(_mm512_extracti32x4_epi32::<3>(x0)),
Simd(_mm512_extracti32x4_epi32::<0>(x1)),
Simd(_mm512_extracti32x4_epi32::<1>(x1)),
Simd(_mm512_extracti32x4_epi32::<2>(x1)),
Simd(_mm512_extracti32x4_epi32::<3>(x1)),
];
fold_tail(lanes, consts)
}
}
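// Dispatches on the runtime alignment of the block data so the loop body can
// be monomorphized with aligned loads when possible; every VPCLMULQDQ
// multi-way driver below uses the same pattern.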
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn update_simd_vpclmul(state: u64, first: &[Simd; 8], rest: &[[Simd; 8]], consts: &Crc64ClmulConstants) -> u64 {
unsafe {
let aligned = ((first as *const [Simd; 8] as usize) & 63) == 0;
if aligned {
update_simd_vpclmul_impl::<true>(state, first, rest, consts)
} else {
update_simd_vpclmul_impl::<false>(state, first, rest, consts)
}
}
}
#[inline]
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn update_simd_vpclmul_impl<const ALIGNED: bool>(
state: u64,
first: &[Simd; 8],
rest: &[[Simd; 8]],
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
let (mut x0, mut x1) = load_128b_block::<ALIGNED>(first);
let crc_mask = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, state.cast_signed());
x0 = _mm512_xor_si512(x0, crc_mask);
let coeff = vpclmul_coeff(consts.fold_128b);
for chunk in rest {
let (y0, y1) = load_128b_block::<ALIGNED>(chunk);
x0 = fold16_4x_ternlog(x0, y0, coeff);
x1 = fold16_4x_ternlog(x1, y1, coeff);
}
finalize_vpclmul_state(x0, x1, consts)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn crc64_vpclmul(mut state: u64, bytes: &[u8], consts: &Crc64ClmulConstants, tables: &[[u64; 256]; 8]) -> u64 {
unsafe {
let (left, middle, right) = bytes.align_to::<[Simd; 8]>();
if let Some((first, rest)) = middle.split_first() {
state = super::portable::crc64_slice8(state, left, tables);
state = update_simd_vpclmul(state, first, rest, consts);
super::portable::crc64_slice8(state, right, tables)
} else {
super::portable::crc64_slice8(state, bytes, tables)
}
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn update_simd_vpclmul_2way(
state: u64,
blocks: &[[Simd; 8]],
fold_256b: (u64, u64),
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
let aligned = ((blocks.as_ptr() as usize) & 63) == 0;
if aligned {
update_simd_vpclmul_2way_impl::<true>(state, blocks, fold_256b, consts)
} else {
update_simd_vpclmul_2way_impl::<false>(state, blocks, fold_256b, consts)
}
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn update_simd_vpclmul_2way_impl<const ALIGNED: bool>(
state: u64,
blocks: &[[Simd; 8]],
fold_256b: (u64, u64),
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
debug_assert!(!blocks.is_empty());
if blocks.len() < 2 {
let Some((first, rest)) = blocks.split_first() else {
return state;
};
return update_simd_vpclmul(state, first, rest, consts);
}
let even = blocks.len() & !1usize;
let double_even = 2usize.strict_add(((blocks.len().strict_sub(2)) / 4).strict_mul(4));
let (mut x0_0, mut x1_0) = load_128b_block::<ALIGNED>(&blocks[0]);
let (mut x0_1, mut x1_1) = load_128b_block::<ALIGNED>(&blocks[1]);
let crc_mask = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, state.cast_signed());
x0_0 = _mm512_xor_si512(x0_0, crc_mask);
let coeff_256 = vpclmul_coeff(fold_256b);
let coeff_128 = vpclmul_coeff(consts.fold_128b);
let mut i = 2;
while i < double_even {
let prefetch_ptr = blocks.as_ptr().add(i).cast::<u8>();
prefetch_read_l1(prefetch_ptr.wrapping_add(LARGE_BLOCK_DISTANCE));
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i]);
x0_0 = fold16_4x_ternlog(x0_0, y0, coeff_256);
x1_0 = fold16_4x_ternlog(x1_0, y1, coeff_256);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 1]);
x0_1 = fold16_4x_ternlog(x0_1, y0, coeff_256);
x1_1 = fold16_4x_ternlog(x1_1, y1, coeff_256);
let (z0, z1) = load_128b_block::<ALIGNED>(&blocks[i + 2]);
x0_0 = fold16_4x_ternlog(x0_0, z0, coeff_256);
x1_0 = fold16_4x_ternlog(x1_0, z1, coeff_256);
let (z0, z1) = load_128b_block::<ALIGNED>(&blocks[i + 3]);
x0_1 = fold16_4x_ternlog(x0_1, z0, coeff_256);
x1_1 = fold16_4x_ternlog(x1_1, z1, coeff_256);
i = i.strict_add(4);
}
while i < even {
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i]);
x0_0 = fold16_4x_ternlog(x0_0, y0, coeff_256);
x1_0 = fold16_4x_ternlog(x1_0, y1, coeff_256);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 1]);
x0_1 = fold16_4x_ternlog(x0_1, y0, coeff_256);
x1_1 = fold16_4x_ternlog(x1_1, y1, coeff_256);
i = i.strict_add(2);
}
let mut x0 = fold16_4x_ternlog(x0_0, x0_1, coeff_128);
let mut x1 = fold16_4x_ternlog(x1_0, x1_1, coeff_128);
if even != blocks.len() {
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[even]);
x0 = fold16_4x_ternlog(x0, y0, coeff_128);
x1 = fold16_4x_ternlog(x1, y1, coeff_128);
}
finalize_vpclmul_state(x0, x1, consts)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn update_simd_vpclmul_4way(
state: u64,
blocks: &[[Simd; 8]],
fold_512b: (u64, u64),
combine: &[(u64, u64); 3],
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
let aligned = ((blocks.as_ptr() as usize) & 63) == 0;
if aligned {
update_simd_vpclmul_4way_impl::<true>(state, blocks, fold_512b, combine, consts)
} else {
update_simd_vpclmul_4way_impl::<false>(state, blocks, fold_512b, combine, consts)
}
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn update_simd_vpclmul_4way_impl<const ALIGNED: bool>(
state: u64,
blocks: &[[Simd; 8]],
fold_512b: (u64, u64),
combine: &[(u64, u64); 3],
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
debug_assert!(!blocks.is_empty());
if blocks.len() < 4 {
let Some((first, rest)) = blocks.split_first() else {
return state;
};
return update_simd_vpclmul(state, first, rest, consts);
}
let aligned = (blocks.len() / 4) * 4;
let double_aligned = 4usize.strict_add(((blocks.len().strict_sub(4)) / 8).strict_mul(8));
let (mut x0_0, mut x1_0) = load_128b_block::<ALIGNED>(&blocks[0]);
let (mut x0_1, mut x1_1) = load_128b_block::<ALIGNED>(&blocks[1]);
let (mut x0_2, mut x1_2) = load_128b_block::<ALIGNED>(&blocks[2]);
let (mut x0_3, mut x1_3) = load_128b_block::<ALIGNED>(&blocks[3]);
let crc_mask = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, state.cast_signed());
x0_0 = _mm512_xor_si512(x0_0, crc_mask);
let coeff_512 = vpclmul_coeff(fold_512b);
let coeff_128 = vpclmul_coeff(consts.fold_128b);
let c384 = vpclmul_coeff(combine[0]);
let c256 = vpclmul_coeff(combine[1]);
let c128 = vpclmul_coeff(combine[2]);
let mut i = 4;
while i < double_aligned {
let prefetch_ptr = blocks.as_ptr().add(i).cast::<u8>();
prefetch_read_l1(prefetch_ptr.wrapping_add(LARGE_BLOCK_DISTANCE));
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i]);
x0_0 = fold16_4x_ternlog(x0_0, y0, coeff_512);
x1_0 = fold16_4x_ternlog(x1_0, y1, coeff_512);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 1]);
x0_1 = fold16_4x_ternlog(x0_1, y0, coeff_512);
x1_1 = fold16_4x_ternlog(x1_1, y1, coeff_512);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 2]);
x0_2 = fold16_4x_ternlog(x0_2, y0, coeff_512);
x1_2 = fold16_4x_ternlog(x1_2, y1, coeff_512);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 3]);
x0_3 = fold16_4x_ternlog(x0_3, y0, coeff_512);
x1_3 = fold16_4x_ternlog(x1_3, y1, coeff_512);
let (z0, z1) = load_128b_block::<ALIGNED>(&blocks[i + 4]);
x0_0 = fold16_4x_ternlog(x0_0, z0, coeff_512);
x1_0 = fold16_4x_ternlog(x1_0, z1, coeff_512);
let (z0, z1) = load_128b_block::<ALIGNED>(&blocks[i + 5]);
x0_1 = fold16_4x_ternlog(x0_1, z0, coeff_512);
x1_1 = fold16_4x_ternlog(x1_1, z1, coeff_512);
let (z0, z1) = load_128b_block::<ALIGNED>(&blocks[i + 6]);
x0_2 = fold16_4x_ternlog(x0_2, z0, coeff_512);
x1_2 = fold16_4x_ternlog(x1_2, z1, coeff_512);
let (z0, z1) = load_128b_block::<ALIGNED>(&blocks[i + 7]);
x0_3 = fold16_4x_ternlog(x0_3, z0, coeff_512);
x1_3 = fold16_4x_ternlog(x1_3, z1, coeff_512);
i = i.strict_add(8);
}
while i < aligned {
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i]);
x0_0 = fold16_4x_ternlog(x0_0, y0, coeff_512);
x1_0 = fold16_4x_ternlog(x1_0, y1, coeff_512);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 1]);
x0_1 = fold16_4x_ternlog(x0_1, y0, coeff_512);
x1_1 = fold16_4x_ternlog(x1_1, y1, coeff_512);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 2]);
x0_2 = fold16_4x_ternlog(x0_2, y0, coeff_512);
x1_2 = fold16_4x_ternlog(x1_2, y1, coeff_512);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 3]);
x0_3 = fold16_4x_ternlog(x0_3, y0, coeff_512);
x1_3 = fold16_4x_ternlog(x1_3, y1, coeff_512);
i = i.strict_add(4);
}
let mut x0 = fold16_4x_ternlog(x0_2, x0_3, c128);
let mut x1 = fold16_4x_ternlog(x1_2, x1_3, c128);
x0 = fold16_4x_ternlog(x0_1, x0, c256);
x1 = fold16_4x_ternlog(x1_1, x1, c256);
x0 = fold16_4x_ternlog(x0_0, x0, c384);
x1 = fold16_4x_ternlog(x1_0, x1, c384);
for block in &blocks[aligned..] {
let (y0, y1) = load_128b_block::<ALIGNED>(block);
x0 = fold16_4x_ternlog(x0, y0, coeff_128);
x1 = fold16_4x_ternlog(x1, y1, coeff_128);
}
finalize_vpclmul_state(x0, x1, consts)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn update_simd_vpclmul_7way(
state: u64,
blocks: &[[Simd; 8]],
fold_896b: (u64, u64),
combine: &[(u64, u64); 6],
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
let aligned = ((blocks.as_ptr() as usize) & 63) == 0;
if aligned {
update_simd_vpclmul_7way_impl::<true>(state, blocks, fold_896b, combine, consts)
} else {
update_simd_vpclmul_7way_impl::<false>(state, blocks, fold_896b, combine, consts)
}
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn update_simd_vpclmul_7way_impl<const ALIGNED: bool>(
state: u64,
blocks: &[[Simd; 8]],
fold_896b: (u64, u64),
combine: &[(u64, u64); 6],
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
debug_assert!(!blocks.is_empty());
if blocks.len() < 7 {
let Some((first, rest)) = blocks.split_first() else {
return state;
};
return update_simd_vpclmul(state, first, rest, consts);
}
let aligned = (blocks.len() / 7) * 7;
let (mut x0_0, mut x1_0) = load_128b_block::<ALIGNED>(&blocks[0]);
let (mut x0_1, mut x1_1) = load_128b_block::<ALIGNED>(&blocks[1]);
let (mut x0_2, mut x1_2) = load_128b_block::<ALIGNED>(&blocks[2]);
let (mut x0_3, mut x1_3) = load_128b_block::<ALIGNED>(&blocks[3]);
let (mut x0_4, mut x1_4) = load_128b_block::<ALIGNED>(&blocks[4]);
let (mut x0_5, mut x1_5) = load_128b_block::<ALIGNED>(&blocks[5]);
let (mut x0_6, mut x1_6) = load_128b_block::<ALIGNED>(&blocks[6]);
let crc_mask = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, state.cast_signed());
x0_0 = _mm512_xor_si512(x0_0, crc_mask);
let coeff_896 = vpclmul_coeff(fold_896b);
let coeff_128 = vpclmul_coeff(consts.fold_128b);
let c768 = vpclmul_coeff(combine[0]);
let c512 = vpclmul_coeff(combine[2]);
let c256 = vpclmul_coeff(combine[4]);
let c128 = vpclmul_coeff(combine[5]);
let mut i = 7;
while i < aligned {
let prefetch_ptr = blocks.as_ptr().add(i).cast::<u8>();
prefetch_read_l1(prefetch_ptr.wrapping_add(LARGE_BLOCK_DISTANCE));
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i]);
x0_0 = fold16_4x_ternlog(x0_0, y0, coeff_896);
x1_0 = fold16_4x_ternlog(x1_0, y1, coeff_896);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 1]);
x0_1 = fold16_4x_ternlog(x0_1, y0, coeff_896);
x1_1 = fold16_4x_ternlog(x1_1, y1, coeff_896);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 2]);
x0_2 = fold16_4x_ternlog(x0_2, y0, coeff_896);
x1_2 = fold16_4x_ternlog(x1_2, y1, coeff_896);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 3]);
x0_3 = fold16_4x_ternlog(x0_3, y0, coeff_896);
x1_3 = fold16_4x_ternlog(x1_3, y1, coeff_896);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 4]);
x0_4 = fold16_4x_ternlog(x0_4, y0, coeff_896);
x1_4 = fold16_4x_ternlog(x1_4, y1, coeff_896);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 5]);
x0_5 = fold16_4x_ternlog(x0_5, y0, coeff_896);
x1_5 = fold16_4x_ternlog(x1_5, y1, coeff_896);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 6]);
x0_6 = fold16_4x_ternlog(x0_6, y0, coeff_896);
x1_6 = fold16_4x_ternlog(x1_6, y1, coeff_896);
i = i.strict_add(7);
}
let mut x0 = fold16_4x_ternlog(x0_5, x0_6, c128);
let mut x1 = fold16_4x_ternlog(x1_5, x1_6, c128);
let t0 = fold16_4x_ternlog(x0_3, x0_4, c128);
let t1 = fold16_4x_ternlog(x1_3, x1_4, c128);
x0 = fold16_4x_ternlog(t0, x0, c256);
x1 = fold16_4x_ternlog(t1, x1, c256);
let t0 = fold16_4x_ternlog(x0_1, x0_2, c128);
let t1 = fold16_4x_ternlog(x1_1, x1_2, c128);
x0 = fold16_4x_ternlog(t0, x0, c512);
x1 = fold16_4x_ternlog(t1, x1, c512);
x0 = fold16_4x_ternlog(x0_0, x0, c768);
x1 = fold16_4x_ternlog(x1_0, x1, c768);
for block in &blocks[aligned..] {
let (y0, y1) = load_128b_block::<ALIGNED>(block);
x0 = fold16_4x_ternlog(x0, y0, coeff_128);
x1 = fold16_4x_ternlog(x1, y1, coeff_128);
}
finalize_vpclmul_state(x0, x1, consts)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn update_simd_vpclmul_8way(
state: u64,
blocks: &[[Simd; 8]],
fold_1024b: (u64, u64),
combine: &[(u64, u64); 7],
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
let aligned = ((blocks.as_ptr() as usize) & 63) == 0;
if aligned {
update_simd_vpclmul_8way_impl::<true>(state, blocks, fold_1024b, combine, consts)
} else {
update_simd_vpclmul_8way_impl::<false>(state, blocks, fold_1024b, combine, consts)
}
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn update_simd_vpclmul_8way_impl<const ALIGNED: bool>(
state: u64,
blocks: &[[Simd; 8]],
fold_1024b: (u64, u64),
combine: &[(u64, u64); 7],
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
debug_assert!(!blocks.is_empty());
if blocks.len() < 8 {
let Some((first, rest)) = blocks.split_first() else {
return state;
};
return update_simd_vpclmul(state, first, rest, consts);
}
let aligned = (blocks.len() / 8) * 8;
let (mut x0_0, mut x1_0) = load_128b_block::<ALIGNED>(&blocks[0]);
let (mut x0_1, mut x1_1) = load_128b_block::<ALIGNED>(&blocks[1]);
let (mut x0_2, mut x1_2) = load_128b_block::<ALIGNED>(&blocks[2]);
let (mut x0_3, mut x1_3) = load_128b_block::<ALIGNED>(&blocks[3]);
let (mut x0_4, mut x1_4) = load_128b_block::<ALIGNED>(&blocks[4]);
let (mut x0_5, mut x1_5) = load_128b_block::<ALIGNED>(&blocks[5]);
let (mut x0_6, mut x1_6) = load_128b_block::<ALIGNED>(&blocks[6]);
let (mut x0_7, mut x1_7) = load_128b_block::<ALIGNED>(&blocks[7]);
let crc_mask = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, state.cast_signed());
x0_0 = _mm512_xor_si512(x0_0, crc_mask);
let coeff_1024 = vpclmul_coeff(fold_1024b);
let coeff_128 = vpclmul_coeff(consts.fold_128b);
let c768 = vpclmul_coeff(combine[1]);
let c512 = vpclmul_coeff(combine[3]);
let c256 = vpclmul_coeff(combine[5]);
let c128 = vpclmul_coeff(combine[6]);
let mut i = 8;
while i < aligned {
let prefetch_ptr = blocks.as_ptr().add(i).cast::<u8>();
prefetch_read_l1(prefetch_ptr.wrapping_add(LARGE_BLOCK_DISTANCE));
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i]);
x0_0 = fold16_4x_ternlog(x0_0, y0, coeff_1024);
x1_0 = fold16_4x_ternlog(x1_0, y1, coeff_1024);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 1]);
x0_1 = fold16_4x_ternlog(x0_1, y0, coeff_1024);
x1_1 = fold16_4x_ternlog(x1_1, y1, coeff_1024);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 2]);
x0_2 = fold16_4x_ternlog(x0_2, y0, coeff_1024);
x1_2 = fold16_4x_ternlog(x1_2, y1, coeff_1024);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 3]);
x0_3 = fold16_4x_ternlog(x0_3, y0, coeff_1024);
x1_3 = fold16_4x_ternlog(x1_3, y1, coeff_1024);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 4]);
x0_4 = fold16_4x_ternlog(x0_4, y0, coeff_1024);
x1_4 = fold16_4x_ternlog(x1_4, y1, coeff_1024);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 5]);
x0_5 = fold16_4x_ternlog(x0_5, y0, coeff_1024);
x1_5 = fold16_4x_ternlog(x1_5, y1, coeff_1024);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 6]);
x0_6 = fold16_4x_ternlog(x0_6, y0, coeff_1024);
x1_6 = fold16_4x_ternlog(x1_6, y1, coeff_1024);
let (y0, y1) = load_128b_block::<ALIGNED>(&blocks[i + 7]);
x0_7 = fold16_4x_ternlog(x0_7, y0, coeff_1024);
x1_7 = fold16_4x_ternlog(x1_7, y1, coeff_1024);
i = i.strict_add(8);
}
let mut x0 = fold16_4x_ternlog(x0_6, x0_7, c128);
let mut x1 = fold16_4x_ternlog(x1_6, x1_7, c128);
let t0 = fold16_4x_ternlog(x0_4, x0_5, c128);
let t1 = fold16_4x_ternlog(x1_4, x1_5, c128);
x0 = fold16_4x_ternlog(t0, x0, c256);
x1 = fold16_4x_ternlog(t1, x1, c256);
let t0 = fold16_4x_ternlog(x0_2, x0_3, c128);
let t1 = fold16_4x_ternlog(x1_2, x1_3, c128);
x0 = fold16_4x_ternlog(t0, x0, c512);
x1 = fold16_4x_ternlog(t1, x1, c512);
let t0 = fold16_4x_ternlog(x0_0, x0_1, c128);
let t1 = fold16_4x_ternlog(x1_0, x1_1, c128);
x0 = fold16_4x_ternlog(t0, x0, c768);
x1 = fold16_4x_ternlog(t1, x1, c768);
for block in &blocks[aligned..] {
let (y0, y1) = load_128b_block::<ALIGNED>(block);
x0 = fold16_4x_ternlog(x0, y0, coeff_128);
x1 = fold16_4x_ternlog(x1, y1, coeff_128);
}
finalize_vpclmul_state(x0, x1, consts)
}
}
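// Reduces the four 512-bit accumulators of a 256-byte block: fold by 128
// bytes down to two registers, by 64 bytes down to one, then combine the four
// surviving 16-byte lanes with the 48/32/16-byte tail constants and reduce.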
#[inline]
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn finalize_4x512_state(
x0: __m512i,
x1: __m512i,
x2: __m512i,
x3: __m512i,
fold_128b: (u64, u64),
fold_64b: (u64, u64),
consts: &Crc64ClmulConstants,
) -> u64 {
unsafe {
let coeff_128 = vpclmul_coeff(fold_128b);
let r0 = fold16_4x_ternlog(x0, x2, coeff_128);
let r1 = fold16_4x_ternlog(x1, x3, coeff_128);
let coeff_64 = vpclmul_coeff(fold_64b);
let acc = fold16_4x_ternlog(r0, r1, coeff_64);
let l0 = Simd(_mm512_extracti32x4_epi32::<0>(acc));
let l1 = Simd(_mm512_extracti32x4_epi32::<1>(acc));
let l2 = Simd(_mm512_extracti32x4_epi32::<2>(acc));
let l3 = Simd(_mm512_extracti32x4_epi32::<3>(acc));
let c48 = Simd::new(consts.tail_fold_16b[4].0, consts.tail_fold_16b[4].1);
let c32 = Simd::new(consts.tail_fold_16b[5].0, consts.tail_fold_16b[5].1);
let c16 = Simd::new(consts.tail_fold_16b[6].0, consts.tail_fold_16b[6].1);
let mut res = l3;
res ^= l2.fold_16(c16);
res ^= l1.fold_16(c32);
res ^= l0.fold_16(c48);
res.fold_8(consts.fold_8b).barrett(consts.poly, consts.mu)
}
}
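// Streaming kernel over raw pointers: aligns the input to 64 bytes, then
// processes 256-byte blocks (512 bytes per unrolled iteration) in four
// 512-bit accumulators, delegating short prefixes and leftovers to the
// portable kernel.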
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn crc64_vpclmul_4x512(
mut state: u64,
bytes: &[u8],
fold_256b: (u64, u64),
consts: &Crc64ClmulConstants,
tables: &[[u64; 256]; 8],
) -> u64 {
unsafe {
use crate::checksum::common::prefetch::{LARGE_BLOCK_DISTANCE, prefetch_read_l1};
const BLOCK_SIZE: usize = 256;
const DOUBLE_BLOCK: usize = 512;
if bytes.len() < BLOCK_SIZE {
let (left, middle, right) = bytes.align_to::<[Simd; 8]>();
if let Some((first, rest)) = middle.split_first() {
state = super::portable::crc64_slice8(state, left, tables);
state = update_simd_vpclmul(state, first, rest, consts);
return super::portable::crc64_slice8(state, right, tables);
}
return super::portable::crc64_slice8(state, bytes, tables);
}
let ptr = bytes.as_ptr();
let misalign = (ptr as usize) & 63;
let align_offset = if misalign == 0 { 0 } else { 64usize.strict_sub(misalign) }.min(bytes.len());
state = super::portable::crc64_slice8(state, &bytes[..align_offset], tables);
let aligned_bytes = &bytes[align_offset..];
if aligned_bytes.len() < BLOCK_SIZE {
return super::portable::crc64_slice8(state, aligned_bytes, tables);
}
let mut ptr = aligned_bytes.as_ptr();
let end = ptr.add(aligned_bytes.len());
debug_assert_eq!((ptr as usize) & 63, 0);
let mut x0 = _mm512_load_si512(ptr.cast::<__m512i>());
let mut x1 = _mm512_load_si512(ptr.add(64).cast::<__m512i>());
let mut x2 = _mm512_load_si512(ptr.add(128).cast::<__m512i>());
let mut x3 = _mm512_load_si512(ptr.add(192).cast::<__m512i>());
ptr = ptr.add(BLOCK_SIZE);
let crc_mask = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, state.cast_signed());
x0 = _mm512_xor_si512(x0, crc_mask);
let coeff_256 = vpclmul_coeff(fold_256b);
// Compare remaining length with integer arithmetic: forming `ptr.add(N)` past
// one-past-the-end of the slice would be UB even without dereferencing it.
while (end as usize).strict_sub(ptr as usize) >= DOUBLE_BLOCK {
prefetch_read_l1(ptr.wrapping_add(LARGE_BLOCK_DISTANCE));
let y0 = _mm512_load_si512(ptr.cast::<__m512i>());
let y1 = _mm512_load_si512(ptr.add(64).cast::<__m512i>());
let y2 = _mm512_load_si512(ptr.add(128).cast::<__m512i>());
let y3 = _mm512_load_si512(ptr.add(192).cast::<__m512i>());
x0 = fold16_4x_ternlog(x0, y0, coeff_256);
x1 = fold16_4x_ternlog(x1, y1, coeff_256);
x2 = fold16_4x_ternlog(x2, y2, coeff_256);
x3 = fold16_4x_ternlog(x3, y3, coeff_256);
let z0 = _mm512_load_si512(ptr.add(256).cast::<__m512i>());
let z1 = _mm512_load_si512(ptr.add(320).cast::<__m512i>());
let z2 = _mm512_load_si512(ptr.add(384).cast::<__m512i>());
let z3 = _mm512_load_si512(ptr.add(448).cast::<__m512i>());
x0 = fold16_4x_ternlog(x0, z0, coeff_256);
x1 = fold16_4x_ternlog(x1, z1, coeff_256);
x2 = fold16_4x_ternlog(x2, z2, coeff_256);
x3 = fold16_4x_ternlog(x3, z3, coeff_256);
ptr = ptr.add(DOUBLE_BLOCK);
}
if (end as usize).strict_sub(ptr as usize) >= BLOCK_SIZE {
let y0 = _mm512_load_si512(ptr.cast::<__m512i>());
let y1 = _mm512_load_si512(ptr.add(64).cast::<__m512i>());
let y2 = _mm512_load_si512(ptr.add(128).cast::<__m512i>());
let y3 = _mm512_load_si512(ptr.add(192).cast::<__m512i>());
x0 = fold16_4x_ternlog(x0, y0, coeff_256);
x1 = fold16_4x_ternlog(x1, y1, coeff_256);
x2 = fold16_4x_ternlog(x2, y2, coeff_256);
x3 = fold16_4x_ternlog(x3, y3, coeff_256);
ptr = ptr.add(BLOCK_SIZE);
}
state = finalize_4x512_state(x0, x1, x2, x3, consts.fold_128b, consts.tail_fold_16b[3], consts);
let remaining = end.offset_from(ptr) as usize;
super::portable::crc64_slice8(state, core::slice::from_raw_parts(ptr, remaining), tables)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn crc64_vpclmul_2way(
mut state: u64,
bytes: &[u8],
fold_256b: (u64, u64),
consts: &Crc64ClmulConstants,
tables: &[[u64; 256]; 8],
) -> u64 {
unsafe {
let (left, middle, right) = bytes.align_to::<[Simd; 8]>();
if middle.is_empty() {
return super::portable::crc64_slice8(state, bytes, tables);
}
state = super::portable::crc64_slice8(state, left, tables);
state = update_simd_vpclmul_2way(state, middle, fold_256b, consts);
super::portable::crc64_slice8(state, right, tables)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn crc64_vpclmul_4way(
mut state: u64,
bytes: &[u8],
fold_512b: (u64, u64),
combine: &[(u64, u64); 3],
consts: &Crc64ClmulConstants,
tables: &[[u64; 256]; 8],
) -> u64 {
unsafe {
let (left, middle, right) = bytes.align_to::<[Simd; 8]>();
if middle.is_empty() {
return super::portable::crc64_slice8(state, bytes, tables);
}
state = super::portable::crc64_slice8(state, left, tables);
state = update_simd_vpclmul_4way(state, middle, fold_512b, combine, consts);
super::portable::crc64_slice8(state, right, tables)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn crc64_vpclmul_7way(
mut state: u64,
bytes: &[u8],
fold_896b: (u64, u64),
combine: &[(u64, u64); 6],
consts: &Crc64ClmulConstants,
tables: &[[u64; 256]; 8],
) -> u64 {
unsafe {
let (left, middle, right) = bytes.align_to::<[Simd; 8]>();
if middle.is_empty() {
return super::portable::crc64_slice8(state, bytes, tables);
}
state = super::portable::crc64_slice8(state, left, tables);
state = update_simd_vpclmul_7way(state, middle, fold_896b, combine, consts);
super::portable::crc64_slice8(state, right, tables)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
unsafe fn crc64_vpclmul_8way(
mut state: u64,
bytes: &[u8],
fold_1024b: (u64, u64),
combine: &[(u64, u64); 7],
consts: &Crc64ClmulConstants,
tables: &[[u64; 256]; 8],
) -> u64 {
unsafe {
let (left, middle, right) = bytes.align_to::<[Simd; 8]>();
if middle.is_empty() {
return super::portable::crc64_slice8(state, bytes, tables);
}
state = super::portable::crc64_slice8(state, left, tables);
state = update_simd_vpclmul_8way(state, middle, fold_1024b, combine, consts);
super::portable::crc64_slice8(state, right, tables)
}
}
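// Public kernels: monomorphizations of the drivers above over the XZ and
// NVME constant sets and slice-by-8 tables.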
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
pub(crate) unsafe fn crc64_xz_pclmul(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_pclmul(
crc,
data,
&crate::checksum::common::clmul::CRC64_XZ_CLMUL,
&super::kernel_tables::XZ_TABLES_8,
)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
pub(crate) unsafe fn crc64_xz_pclmul_small(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_pclmul_small(
crc,
data,
&crate::checksum::common::clmul::CRC64_XZ_CLMUL,
&super::kernel_tables::XZ_TABLES_8,
)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
pub(crate) unsafe fn crc64_xz_pclmul_2way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_pclmul_2way(
crc,
data,
CRC64_XZ_STREAM.fold_256b,
&crate::checksum::common::clmul::CRC64_XZ_CLMUL,
&super::kernel_tables::XZ_TABLES_8,
)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
pub(crate) unsafe fn crc64_xz_pclmul_4way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_pclmul_4way(
crc,
data,
CRC64_XZ_STREAM.fold_512b,
&CRC64_XZ_STREAM.combine_4way,
&crate::checksum::common::clmul::CRC64_XZ_CLMUL,
&super::kernel_tables::XZ_TABLES_8,
)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
pub(crate) unsafe fn crc64_xz_pclmul_7way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_pclmul_7way(
crc,
data,
CRC64_XZ_STREAM.fold_896b,
&CRC64_XZ_STREAM.combine_7way,
&crate::checksum::common::clmul::CRC64_XZ_CLMUL,
&super::kernel_tables::XZ_TABLES_8,
)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
pub(crate) unsafe fn crc64_xz_pclmul_8way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_pclmul_8way(
crc,
data,
CRC64_XZ_STREAM.fold_1024b,
&CRC64_XZ_STREAM.combine_8way,
&crate::checksum::common::clmul::CRC64_XZ_CLMUL,
&super::kernel_tables::XZ_TABLES_8,
)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
pub(crate) unsafe fn crc64_xz_vpclmul(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_vpclmul(
crc,
data,
&crate::checksum::common::clmul::CRC64_XZ_CLMUL,
&super::kernel_tables::XZ_TABLES_8,
)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
pub(crate) unsafe fn crc64_xz_vpclmul_2way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_vpclmul_2way(
crc,
data,
CRC64_XZ_STREAM.fold_256b,
&crate::checksum::common::clmul::CRC64_XZ_CLMUL,
&super::kernel_tables::XZ_TABLES_8,
)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
pub(crate) unsafe fn crc64_xz_vpclmul_4way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_vpclmul_4way(
crc,
data,
CRC64_XZ_STREAM.fold_512b,
&CRC64_XZ_STREAM.combine_4way,
&crate::checksum::common::clmul::CRC64_XZ_CLMUL,
&super::kernel_tables::XZ_TABLES_8,
)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
pub(crate) unsafe fn crc64_xz_vpclmul_7way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_vpclmul_7way(
crc,
data,
CRC64_XZ_STREAM.fold_896b,
&CRC64_XZ_STREAM.combine_7way,
&crate::checksum::common::clmul::CRC64_XZ_CLMUL,
&super::kernel_tables::XZ_TABLES_8,
)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
pub(crate) unsafe fn crc64_xz_vpclmul_8way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_vpclmul_8way(
crc,
data,
CRC64_XZ_STREAM.fold_1024b,
&CRC64_XZ_STREAM.combine_8way,
&crate::checksum::common::clmul::CRC64_XZ_CLMUL,
&super::kernel_tables::XZ_TABLES_8,
)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
pub(crate) unsafe fn crc64_xz_vpclmul_4x512(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_vpclmul_4x512(
crc,
data,
CRC64_XZ_STREAM.fold_256b,
&crate::checksum::common::clmul::CRC64_XZ_CLMUL,
&super::kernel_tables::XZ_TABLES_8,
)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
pub(crate) unsafe fn crc64_nvme_pclmul(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_pclmul(
crc,
data,
&crate::checksum::common::clmul::CRC64_NVME_CLMUL,
&super::kernel_tables::NVME_TABLES_8,
)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
pub(crate) unsafe fn crc64_nvme_pclmul_small(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_pclmul_small(
crc,
data,
&crate::checksum::common::clmul::CRC64_NVME_CLMUL,
&super::kernel_tables::NVME_TABLES_8,
)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
pub(crate) unsafe fn crc64_nvme_pclmul_2way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_pclmul_2way(
crc,
data,
CRC64_NVME_STREAM.fold_256b,
&crate::checksum::common::clmul::CRC64_NVME_CLMUL,
&super::kernel_tables::NVME_TABLES_8,
)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
pub(crate) unsafe fn crc64_nvme_pclmul_4way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_pclmul_4way(
crc,
data,
CRC64_NVME_STREAM.fold_512b,
&CRC64_NVME_STREAM.combine_4way,
&crate::checksum::common::clmul::CRC64_NVME_CLMUL,
&super::kernel_tables::NVME_TABLES_8,
)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
pub(crate) unsafe fn crc64_nvme_pclmul_7way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_pclmul_7way(
crc,
data,
CRC64_NVME_STREAM.fold_896b,
&CRC64_NVME_STREAM.combine_7way,
&crate::checksum::common::clmul::CRC64_NVME_CLMUL,
&super::kernel_tables::NVME_TABLES_8,
)
}
}
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
pub(crate) unsafe fn crc64_nvme_pclmul_8way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_pclmul_8way(
crc,
data,
CRC64_NVME_STREAM.fold_1024b,
&CRC64_NVME_STREAM.combine_8way,
&crate::checksum::common::clmul::CRC64_NVME_CLMUL,
&super::kernel_tables::NVME_TABLES_8,
)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
pub(crate) unsafe fn crc64_nvme_vpclmul(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_vpclmul(
crc,
data,
&crate::checksum::common::clmul::CRC64_NVME_CLMUL,
&super::kernel_tables::NVME_TABLES_8,
)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
pub(crate) unsafe fn crc64_nvme_vpclmul_2way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_vpclmul_2way(
crc,
data,
CRC64_NVME_STREAM.fold_256b,
&crate::checksum::common::clmul::CRC64_NVME_CLMUL,
&super::kernel_tables::NVME_TABLES_8,
)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
pub(crate) unsafe fn crc64_nvme_vpclmul_4way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_vpclmul_4way(
crc,
data,
CRC64_NVME_STREAM.fold_512b,
&CRC64_NVME_STREAM.combine_4way,
&crate::checksum::common::clmul::CRC64_NVME_CLMUL,
&super::kernel_tables::NVME_TABLES_8,
)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
pub(crate) unsafe fn crc64_nvme_vpclmul_7way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_vpclmul_7way(
crc,
data,
CRC64_NVME_STREAM.fold_896b,
&CRC64_NVME_STREAM.combine_7way,
&crate::checksum::common::clmul::CRC64_NVME_CLMUL,
&super::kernel_tables::NVME_TABLES_8,
)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
pub(crate) unsafe fn crc64_nvme_vpclmul_8way(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_vpclmul_8way(
crc,
data,
CRC64_NVME_STREAM.fold_1024b,
&CRC64_NVME_STREAM.combine_8way,
&crate::checksum::common::clmul::CRC64_NVME_CLMUL,
&super::kernel_tables::NVME_TABLES_8,
)
}
}
#[target_feature(enable = "avx512f", enable = "vpclmulqdq")]
pub(crate) unsafe fn crc64_nvme_vpclmul_4x512(crc: u64, data: &[u8]) -> u64 {
unsafe {
crc64_vpclmul_4x512(
crc,
data,
CRC64_NVME_STREAM.fold_256b,
&crate::checksum::common::clmul::CRC64_NVME_CLMUL,
&super::kernel_tables::NVME_TABLES_8,
)
}
}
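// Safe wrappers for benchmarks and tests. These do not check CPU features
// themselves; callers are expected to have verified PCLMULQDQ (and, for the
// vpclmul variants, AVX-512F + VPCLMULQDQ) support first, as the tests below
// do.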
#[inline]
pub fn crc64_xz_pclmul_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_xz_pclmul(crc, data) }
}
#[inline]
pub fn crc64_xz_pclmul_small_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_xz_pclmul_small(crc, data) }
}
#[inline]
pub fn crc64_xz_pclmul_2way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_xz_pclmul_2way(crc, data) }
}
#[inline]
pub fn crc64_xz_pclmul_4way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_xz_pclmul_4way(crc, data) }
}
#[inline]
pub fn crc64_xz_pclmul_7way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_xz_pclmul_7way(crc, data) }
}
#[inline]
pub fn crc64_xz_pclmul_8way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_xz_pclmul_8way(crc, data) }
}
#[inline]
pub fn crc64_xz_vpclmul_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_xz_vpclmul(crc, data) }
}
#[inline]
pub fn crc64_xz_vpclmul_2way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_xz_vpclmul_2way(crc, data) }
}
#[inline]
pub fn crc64_xz_vpclmul_4way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_xz_vpclmul_4way(crc, data) }
}
#[inline]
pub fn crc64_xz_vpclmul_7way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_xz_vpclmul_7way(crc, data) }
}
#[inline]
pub fn crc64_xz_vpclmul_8way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_xz_vpclmul_8way(crc, data) }
}
#[inline]
pub fn crc64_xz_vpclmul_4x512_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_xz_vpclmul_4x512(crc, data) }
}
#[inline]
pub fn crc64_nvme_pclmul_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_nvme_pclmul(crc, data) }
}
#[inline]
pub fn crc64_nvme_pclmul_small_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_nvme_pclmul_small(crc, data) }
}
#[inline]
pub fn crc64_nvme_pclmul_2way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_nvme_pclmul_2way(crc, data) }
}
#[inline]
pub fn crc64_nvme_pclmul_4way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_nvme_pclmul_4way(crc, data) }
}
#[inline]
pub fn crc64_nvme_pclmul_7way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_nvme_pclmul_7way(crc, data) }
}
#[inline]
pub fn crc64_nvme_pclmul_8way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_nvme_pclmul_8way(crc, data) }
}
#[inline]
pub fn crc64_nvme_vpclmul_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_nvme_vpclmul(crc, data) }
}
#[inline]
pub fn crc64_nvme_vpclmul_2way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_nvme_vpclmul_2way(crc, data) }
}
#[inline]
pub fn crc64_nvme_vpclmul_4way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_nvme_vpclmul_4way(crc, data) }
}
#[inline]
pub fn crc64_nvme_vpclmul_7way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_nvme_vpclmul_7way(crc, data) }
}
#[inline]
pub fn crc64_nvme_vpclmul_8way_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_nvme_vpclmul_8way(crc, data) }
}
#[inline]
pub fn crc64_nvme_vpclmul_4x512_safe(crc: u64, data: &[u8]) -> u64 {
unsafe { crc64_nvme_vpclmul_4x512(crc, data) }
}
#[cfg(all(test, not(miri)))]
mod tests {
extern crate std;
use alloc::vec::Vec;
use super::*;
fn make_data(len: usize) -> Vec<u8> {
(0..len)
.map(|i| (i as u8).wrapping_mul(17).wrapping_add((i >> 3) as u8))
.collect()
}
#[test]
fn test_crc64_xz_pclmul_matches_vector() {
if !std::arch::is_x86_feature_detected!("pclmulqdq") {
return;
}
let crc = crc64_xz_pclmul_safe(!0, b"123456789") ^ !0;
assert_eq!(crc, 0x995D_C9BB_DF19_39FA);
}
#[test]
fn test_crc64_nvme_pclmul_matches_vector() {
if !std::arch::is_x86_feature_detected!("pclmulqdq") {
return;
}
let crc = crc64_nvme_pclmul_safe(!0, b"123456789") ^ !0;
assert_eq!(crc, 0xAE8B_1486_0A79_9888);
}
#[test]
fn test_crc64_xz_pclmul_matches_portable_various_lengths() {
if !std::arch::is_x86_feature_detected!("pclmulqdq") {
return;
}
for len in [
0, 1, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 129, 255, 256, 511, 512, 1024,
] {
let data = make_data(len);
let portable = super::super::portable::crc64_slice8_xz(!0, &data) ^ !0;
let pclmul = crc64_xz_pclmul_safe(!0, &data) ^ !0;
assert_eq!(pclmul, portable, "mismatch at len={len}");
}
}
#[test]
fn test_crc64_nvme_pclmul_matches_portable_various_lengths() {
if !std::arch::is_x86_feature_detected!("pclmulqdq") {
return;
}
for len in [
0, 1, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 129, 255, 256, 511, 512, 1024,
] {
let data = make_data(len);
let portable = super::super::portable::crc64_slice8_nvme(!0, &data) ^ !0;
let pclmul = crc64_nvme_pclmul_safe(!0, &data) ^ !0;
assert_eq!(pclmul, portable, "mismatch at len={len}");
}
}
#[test]
fn test_crc64_xz_pclmul_small_matches_portable_all_lengths_0_511() {
if !std::arch::is_x86_feature_detected!("pclmulqdq") {
return;
}
for len in 0..512 {
let data = make_data(len);
let portable = super::super::portable::crc64_slice8_xz(!0, &data) ^ !0;
let pclmul_small = crc64_xz_pclmul_small_safe(!0, &data) ^ !0;
assert_eq!(pclmul_small, portable, "mismatch at len={len}");
}
}
#[test]
fn test_crc64_nvme_pclmul_small_matches_portable_all_lengths_0_511() {
if !std::arch::is_x86_feature_detected!("pclmulqdq") {
return;
}
for len in 0..512 {
let data = make_data(len);
let portable = super::super::portable::crc64_slice8_nvme(!0, &data) ^ !0;
let pclmul_small = crc64_nvme_pclmul_small_safe(!0, &data) ^ !0;
assert_eq!(pclmul_small, portable, "mismatch at len={len}");
}
}
#[test]
fn test_crc64_xz_pclmul_2way_matches_portable_various_lengths() {
if !std::arch::is_x86_feature_detected!("pclmulqdq") {
return;
}
for len in [0usize, 1, 7, 16, 63, 64, 127, 128, 255, 256, 1024, 4096, 16 * 1024] {
let data = make_data(len);
let portable = super::super::portable::crc64_slice8_xz(!0, &data) ^ !0;
let pclmul_2way = crc64_xz_pclmul_2way_safe(!0, &data) ^ !0;
assert_eq!(pclmul_2way, portable, "mismatch at len={len}");
}
}
#[test]
fn test_crc64_nvme_pclmul_2way_matches_portable_various_lengths() {
if !std::arch::is_x86_feature_detected!("pclmulqdq") {
return;
}
for len in [0usize, 1, 7, 16, 63, 64, 127, 128, 255, 256, 1024, 4096, 16 * 1024] {
let data = make_data(len);
let portable = super::super::portable::crc64_slice8_nvme(!0, &data) ^ !0;
let pclmul_2way = crc64_nvme_pclmul_2way_safe(!0, &data) ^ !0;
assert_eq!(pclmul_2way, portable, "mismatch at len={len}");
}
}
#[test]
fn test_crc64_xz_pclmul_multiway_matches_portable_various_lengths() {
if !std::arch::is_x86_feature_detected!("pclmulqdq") {
return;
}
for len in [0usize, 1, 7, 16, 63, 64, 127, 128, 255, 256, 512, 1024, 4096, 16 * 1024] {
let data = make_data(len);
let portable = super::super::portable::crc64_slice8_xz(!0, &data) ^ !0;
let vp2 = crc64_xz_pclmul_2way_safe(!0, &data) ^ !0;
let vp4 = crc64_xz_pclmul_4way_safe(!0, &data) ^ !0;
let vp7 = crc64_xz_pclmul_7way_safe(!0, &data) ^ !0;
assert_eq!(vp2, portable, "2-way mismatch at len={len}");
assert_eq!(vp4, portable, "4-way mismatch at len={len}");
assert_eq!(vp7, portable, "7-way mismatch at len={len}");
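// Also exercise the 8-way kernel against the same reference.
let vp8 = crc64_xz_pclmul_8way_safe(!0, &data) ^ !0;
assert_eq!(vp8, portable, "8-way mismatch at len={len}");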
}
}
#[test]
fn test_crc64_nvme_pclmul_multiway_matches_portable_various_lengths() {
if !std::arch::is_x86_feature_detected!("pclmulqdq") {
return;
}
for len in [0usize, 1, 7, 16, 63, 64, 127, 128, 255, 256, 512, 1024, 4096, 16 * 1024] {
let data = make_data(len);
let portable = super::super::portable::crc64_slice8_nvme(!0, &data) ^ !0;
let vp2 = crc64_nvme_pclmul_2way_safe(!0, &data) ^ !0;
let vp4 = crc64_nvme_pclmul_4way_safe(!0, &data) ^ !0;
let vp7 = crc64_nvme_pclmul_7way_safe(!0, &data) ^ !0;
assert_eq!(vp2, portable, "2-way mismatch at len={len}");
assert_eq!(vp4, portable, "4-way mismatch at len={len}");
assert_eq!(vp7, portable, "7-way mismatch at len={len}");
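// Also exercise the 8-way kernel against the same reference.
let vp8 = crc64_nvme_pclmul_8way_safe(!0, &data) ^ !0;
assert_eq!(vp8, portable, "8-way mismatch at len={len}");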
}
}
#[test]
fn test_crc64_xz_vpclmul_multiway_matches_portable_various_lengths() {
if !(std::arch::is_x86_feature_detected!("avx512f") && std::arch::is_x86_feature_detected!("vpclmulqdq")) {
return;
}
for len in [0usize, 1, 7, 16, 63, 64, 127, 128, 255, 256, 512, 1024, 4096, 16 * 1024] {
let data = make_data(len);
let portable = super::super::portable::crc64_slice8_xz(!0, &data) ^ !0;
let vp2 = crc64_xz_vpclmul_2way_safe(!0, &data) ^ !0;
let vp4 = crc64_xz_vpclmul_4way_safe(!0, &data) ^ !0;
let vp7 = crc64_xz_vpclmul_7way_safe(!0, &data) ^ !0;
assert_eq!(vp2, portable, "2-way mismatch at len={len}");
assert_eq!(vp4, portable, "4-way mismatch at len={len}");
assert_eq!(vp7, portable, "7-way mismatch at len={len}");
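// Also exercise the 8-way and 4x512 kernels against the same reference.
let vp8 = crc64_xz_vpclmul_8way_safe(!0, &data) ^ !0;
let vp4x512 = crc64_xz_vpclmul_4x512_safe(!0, &data) ^ !0;
assert_eq!(vp8, portable, "8-way mismatch at len={len}");
assert_eq!(vp4x512, portable, "4x512 mismatch at len={len}");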
}
}
#[test]
fn test_crc64_nvme_vpclmul_multiway_matches_portable_various_lengths() {
if !(std::arch::is_x86_feature_detected!("avx512f") && std::arch::is_x86_feature_detected!("vpclmulqdq")) {
return;
}
for len in [0usize, 1, 7, 16, 63, 64, 127, 128, 255, 256, 512, 1024, 4096, 16 * 1024] {
let data = make_data(len);
let portable = super::super::portable::crc64_slice8_nvme(!0, &data) ^ !0;
let vp2 = crc64_nvme_vpclmul_2way_safe(!0, &data) ^ !0;
let vp4 = crc64_nvme_vpclmul_4way_safe(!0, &data) ^ !0;
let vp7 = crc64_nvme_vpclmul_7way_safe(!0, &data) ^ !0;
assert_eq!(vp2, portable, "2-way mismatch at len={len}");
assert_eq!(vp4, portable, "4-way mismatch at len={len}");
assert_eq!(vp7, portable, "7-way mismatch at len={len}");
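// Also exercise the 8-way and 4x512 kernels against the same reference.
let vp8 = crc64_nvme_vpclmul_8way_safe(!0, &data) ^ !0;
let vp4x512 = crc64_nvme_vpclmul_4x512_safe(!0, &data) ^ !0;
assert_eq!(vp8, portable, "8-way mismatch at len={len}");
assert_eq!(vp4x512, portable, "4x512 mismatch at len={len}");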
}
}
}