use crate::checksum::{Checksum, ChecksumError};
use crate::phys::{ChecksumType, EndianOrder};
use core::cmp;
use core::fmt;
use core::fmt::Display;
#[cfg(all(
target_arch = "x86",
any(
feature = "fletcher2-sse2",
feature = "fletcher2-ssse3",
feature = "fletcher2-avx2",
feature = "fletcher2-avx512f",
feature = "fletcher2-avx512bw",
),
))]
use core::arch::x86 as arch;
#[cfg(all(
target_arch = "x86_64",
any(
feature = "fletcher2-sse2",
feature = "fletcher2-ssse3",
feature = "fletcher2-avx2",
feature = "fletcher2-avx512f",
feature = "fletcher2-avx512bw",
),
))]
use core::arch::x86_64 as arch;
#[cfg(all(
any(target_arch = "x86", target_arch = "x86_64",),
any(feature = "fletcher2-sse2", feature = "fletcher2-ssse3",),
))]
use crate::arch::x86_any::is_sse2_supported;
#[cfg(all(
any(target_arch = "x86", target_arch = "x86_64",),
feature = "fletcher2-ssse3",
))]
use crate::arch::x86_any::is_ssse3_supported;
#[cfg(all(
any(target_arch = "x86", target_arch = "x86_64",),
feature = "fletcher2-avx2",
))]
use crate::arch::x86_any::{is_avx2_supported, is_avx_supported};
#[cfg(all(
any(target_arch = "x86", target_arch = "x86_64",),
any(feature = "fletcher2-avx512f", feature = "fletcher2-avx512bw"),
))]
use crate::arch::x86_any::is_avx512f_supported;
#[cfg(all(
any(target_arch = "x86", target_arch = "x86_64",),
feature = "fletcher2-avx512bw",
))]
use crate::arch::x86_any::is_avx512bw_supported;
/// Size in bytes of one Fletcher2 block: two 64-bit words.
const FLETCHER_2_BLOCK_SIZE: usize = 16;
/// Number of 64-bit words in a finished checksum.
const FLETCHER_2_U64_COUNT: usize = 4;
/// Largest number of blocks any implementation consumes per step; used to
/// size the `buffer` and `state` arrays of `Fletcher2`.
const FLETCHER_2_MAX_SIMD_WIDTH: usize = 4;
/// Selects which Fletcher2 code path a `Fletcher2` instance uses.
///
/// Every variant always exists in the enum; whether it can actually be
/// constructed depends on the enabled crate features and on the runtime
/// support check performed by `Fletcher2::new`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Fletcher2Implementation {
/// Portable code, one 16-byte block per step.
Generic,
/// Portable code, two blocks per step in independent streams.
SuperScalar2,
/// Portable code, four blocks per step in independent streams.
SuperScalar4,
/// x86 SSE2 code path.
SSE2,
/// x86 SSSE3 code path.
SSSE3,
/// x86 AVX2 code path.
AVX2,
/// x86 AVX512F code path.
AVX512F,
/// x86 AVX512BW code path.
AVX512BW,
}
/// Every `Fletcher2Implementation` variant, in declaration order.
/// Backs `Fletcher2Implementation::all`.
const ALL_FLETCHER_2_IMPLEMENTATIONS: [Fletcher2Implementation; 8] = [
Fletcher2Implementation::Generic,
Fletcher2Implementation::SuperScalar2,
Fletcher2Implementation::SuperScalar4,
Fletcher2Implementation::SSE2,
Fletcher2Implementation::SSSE3,
Fletcher2Implementation::AVX2,
Fletcher2Implementation::AVX512F,
Fletcher2Implementation::AVX512BW,
];
impl Fletcher2Implementation {
    /// Lists every implementation variant, whether or not it is compiled in
    /// or supported by the running CPU.
    pub fn all() -> &'static [Fletcher2Implementation] {
        &ALL_FLETCHER_2_IMPLEMENTATIONS
    }

    /// Returns the lowercase name of this implementation.
    pub fn to_str(&self) -> &'static str {
        match *self {
            Self::Generic => "generic",
            Self::SuperScalar2 => "superscalar2",
            Self::SuperScalar4 => "superscalar4",
            Self::SSE2 => "sse2",
            Self::SSSE3 => "ssse3",
            Self::AVX2 => "avx2",
            Self::AVX512F => "avx512f",
            Self::AVX512BW => "avx512bw",
        }
    }

    /// Looks up the function table for this implementation.
    ///
    /// # Errors
    ///
    /// Returns `ChecksumError::Unsupported` when the variant's crate feature
    /// is not enabled, so no table was compiled for it.
    fn get_implementation_ctx(&self) -> Result<&'static Fletcher2ImplementationCtx, ChecksumError> {
        match *self {
            Self::Generic => Ok(&FLETCHER_2_IMPL_CTX_GENERIC),
            Self::SuperScalar2 => Ok(&FLETCHER_2_IMPL_CTX_SUPERSCALAR_2),
            Self::SuperScalar4 => Ok(&FLETCHER_2_IMPL_CTX_SUPERSCALAR_4),
            #[cfg(feature = "fletcher2-sse2")]
            Self::SSE2 => Ok(&FLETCHER_2_IMPL_CTX_SSE2),
            #[cfg(feature = "fletcher2-ssse3")]
            Self::SSSE3 => Ok(&FLETCHER_2_IMPL_CTX_SSSE3),
            #[cfg(feature = "fletcher2-avx2")]
            Self::AVX2 => Ok(&FLETCHER_2_IMPL_CTX_AVX2),
            #[cfg(feature = "fletcher2-avx512f")]
            Self::AVX512F => Ok(&FLETCHER_2_IMPL_CTX_AVX512F),
            #[cfg(feature = "fletcher2-avx512bw")]
            Self::AVX512BW => Ok(&FLETCHER_2_IMPL_CTX_AVX512BW),
            // Only present when at least one SIMD feature is disabled;
            // otherwise the arms above are already exhaustive.
            #[cfg(any(
                not(feature = "fletcher2-sse2"),
                not(feature = "fletcher2-ssse3"),
                not(feature = "fletcher2-avx2"),
                not(feature = "fletcher2-avx512f"),
                not(feature = "fletcher2-avx512bw"),
            ))]
            _ => Err(ChecksumError::Unsupported {
                checksum: ChecksumType::Fletcher2,
                implementation: self.to_str(),
            }),
        }
    }
}
/// Formats the implementation as the name returned by `to_str`.
impl Display for Fletcher2Implementation {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.to_str())
    }
}
/// Folds the complete blocks found in `data` into the running `state`.
type Fletcher2UpdateBlock = fn(state: &mut [u64], data: &[u8]);
/// Collapses an implementation's running `state` into the four checksum words.
type Fletcher2FinishBlocks = fn(state: &[u64]) -> [u64; FLETCHER_2_U64_COUNT];
/// Reports whether an implementation can be used; called by `Fletcher2::new`.
type Fletcher2IsSupported = fn() -> bool;
/// Function table and parameters describing one Fletcher2 implementation.
struct Fletcher2ImplementationCtx {
/// Number of input bytes the update functions consume per step.
block_size: usize,
/// Update function used when the input is big-endian.
update_blocks_big: Fletcher2UpdateBlock,
/// Update function used when the input is little-endian.
update_blocks_little: Fletcher2UpdateBlock,
/// Reduces this implementation's state layout to the final four words.
finish_blocks: Fletcher2FinishBlocks,
/// Support check consulted before the implementation is handed out.
is_supported: Fletcher2IsSupported,
}
/// Portable single-stream implementation: one 16-byte block per step.
const FLETCHER_2_IMPL_CTX_GENERIC: Fletcher2ImplementationCtx = Fletcher2ImplementationCtx {
block_size: FLETCHER_2_BLOCK_SIZE,
update_blocks_big: Fletcher2::update_blocks_generic_big,
update_blocks_little: Fletcher2::update_blocks_generic_little,
finish_blocks: Fletcher2::finish_blocks_single_stream,
// Plain Rust, so always reported as supported.
is_supported: || true,
};
/// Portable dual-stream implementation: two blocks (32 bytes) per step.
const FLETCHER_2_IMPL_CTX_SUPERSCALAR_2: Fletcher2ImplementationCtx = Fletcher2ImplementationCtx {
block_size: 2 * FLETCHER_2_BLOCK_SIZE,
update_blocks_big: Fletcher2::update_blocks_superscalar2_big,
update_blocks_little: Fletcher2::update_blocks_superscalar2_little,
finish_blocks: Fletcher2::finish_blocks_dual_stream,
is_supported: || true,
};
/// Portable quad-stream implementation: four blocks (64 bytes) per step.
const FLETCHER_2_IMPL_CTX_SUPERSCALAR_4: Fletcher2ImplementationCtx = Fletcher2ImplementationCtx {
block_size: 4 * FLETCHER_2_BLOCK_SIZE,
update_blocks_big: Fletcher2::update_blocks_superscalar4_big,
update_blocks_little: Fletcher2::update_blocks_superscalar4_little,
finish_blocks: Fletcher2::finish_blocks_quad_stream,
is_supported: || true,
};
// In every SIMD table below, the update function for the target's own byte
// order is the `_native` variant and the other order is the `_byteswap`
// variant; the `target_endian` cfg pairs pick which field gets which.

/// SSE2 implementation: one 16-byte block per step, single-stream state.
#[cfg(feature = "fletcher2-sse2")]
const FLETCHER_2_IMPL_CTX_SSE2: Fletcher2ImplementationCtx = Fletcher2ImplementationCtx {
block_size: FLETCHER_2_BLOCK_SIZE,
#[cfg(target_endian = "big")]
update_blocks_big: Fletcher2::update_blocks_sse2_native,
#[cfg(target_endian = "big")]
update_blocks_little: Fletcher2::update_blocks_sse2_byteswap,
#[cfg(target_endian = "little")]
update_blocks_big: Fletcher2::update_blocks_sse2_byteswap,
#[cfg(target_endian = "little")]
update_blocks_little: Fletcher2::update_blocks_sse2_native,
finish_blocks: Fletcher2::finish_blocks_single_stream,
is_supported: is_sse2_supported,
};
/// SSSE3 implementation: like SSE2, but the byte-swapping path uses the
/// SSSE3 update function. The native path reuses the SSE2 function.
#[cfg(feature = "fletcher2-ssse3")]
const FLETCHER_2_IMPL_CTX_SSSE3: Fletcher2ImplementationCtx = Fletcher2ImplementationCtx {
block_size: FLETCHER_2_BLOCK_SIZE,
#[cfg(target_endian = "big")]
update_blocks_big: Fletcher2::update_blocks_sse2_native,
#[cfg(target_endian = "big")]
update_blocks_little: Fletcher2::update_blocks_ssse3_byteswap,
#[cfg(target_endian = "little")]
update_blocks_big: Fletcher2::update_blocks_ssse3_byteswap,
#[cfg(target_endian = "little")]
update_blocks_little: Fletcher2::update_blocks_sse2_native,
finish_blocks: Fletcher2::finish_blocks_single_stream,
// Both checks are required because the native path runs SSE2 code.
is_supported: || is_sse2_supported() && is_ssse3_supported(),
};
/// AVX2 implementation: two blocks (32 bytes) per step, dual-stream state.
#[cfg(feature = "fletcher2-avx2")]
const FLETCHER_2_IMPL_CTX_AVX2: Fletcher2ImplementationCtx = Fletcher2ImplementationCtx {
block_size: 2 * FLETCHER_2_BLOCK_SIZE,
#[cfg(target_endian = "big")]
update_blocks_big: Fletcher2::update_blocks_avx2_native,
#[cfg(target_endian = "big")]
update_blocks_little: Fletcher2::update_blocks_avx2_byteswap,
#[cfg(target_endian = "little")]
update_blocks_big: Fletcher2::update_blocks_avx2_byteswap,
#[cfg(target_endian = "little")]
update_blocks_little: Fletcher2::update_blocks_avx2_native,
finish_blocks: Fletcher2::finish_blocks_dual_stream,
is_supported: || is_avx_supported() && is_avx2_supported(),
};
/// AVX512F implementation: four blocks (64 bytes) per step, quad-stream state.
#[cfg(feature = "fletcher2-avx512f")]
const FLETCHER_2_IMPL_CTX_AVX512F: Fletcher2ImplementationCtx = Fletcher2ImplementationCtx {
block_size: 4 * FLETCHER_2_BLOCK_SIZE,
#[cfg(target_endian = "big")]
update_blocks_big: Fletcher2::update_blocks_avx512f_native,
#[cfg(target_endian = "big")]
update_blocks_little: Fletcher2::update_blocks_avx512f_byteswap,
#[cfg(target_endian = "little")]
update_blocks_big: Fletcher2::update_blocks_avx512f_byteswap,
#[cfg(target_endian = "little")]
update_blocks_little: Fletcher2::update_blocks_avx512f_native,
finish_blocks: Fletcher2::finish_blocks_quad_stream,
is_supported: is_avx512f_supported,
};
/// AVX512BW implementation: like AVX512F, but the byte-swapping path uses the
/// AVX512BW update function. The native path reuses the AVX512F function.
#[cfg(feature = "fletcher2-avx512bw")]
const FLETCHER_2_IMPL_CTX_AVX512BW: Fletcher2ImplementationCtx = Fletcher2ImplementationCtx {
block_size: 4 * FLETCHER_2_BLOCK_SIZE,
#[cfg(target_endian = "big")]
update_blocks_big: Fletcher2::update_blocks_avx512f_native,
#[cfg(target_endian = "big")]
update_blocks_little: Fletcher2::update_blocks_avx512bw_byteswap,
#[cfg(target_endian = "little")]
update_blocks_big: Fletcher2::update_blocks_avx512bw_byteswap,
#[cfg(target_endian = "little")]
update_blocks_little: Fletcher2::update_blocks_avx512f_native,
finish_blocks: Fletcher2::finish_blocks_quad_stream,
is_supported: || is_avx512f_supported() && is_avx512bw_supported(),
};
/// Streaming Fletcher2 checksum bound to one selected implementation.
pub struct Fletcher2 {
/// Number of bytes currently waiting in `buffer`.
buffer_fill: usize,
/// Holds input that does not yet fill one implementation-sized step.
buffer: [u8; FLETCHER_2_BLOCK_SIZE * FLETCHER_2_MAX_SIMD_WIDTH],
/// Running sums; how many words are used and their layout depend on the
/// implementation (4, 8 or 16 words).
state: [u64; FLETCHER_2_U64_COUNT * FLETCHER_2_MAX_SIMD_WIDTH],
/// Byte order of the input, set by `reset`.
order: EndianOrder,
/// Function table of the implementation chosen in `new`.
impl_ctx: &'static Fletcher2ImplementationCtx,
/// Update function from `impl_ctx` that matches `order`.
update_blocks: Fletcher2UpdateBlock,
}
impl Fletcher2 {
/// Creates a checksum that uses the requested implementation, starting in
/// little-endian mode with empty state.
///
/// # Errors
///
/// Returns `ChecksumError::Unsupported` when the implementation is not
/// compiled in or its support check reports `false`.
pub fn new(implementation: Fletcher2Implementation) -> Result<Fletcher2, ChecksumError> {
    let impl_ctx = implementation.get_implementation_ctx()?;
    if (impl_ctx.is_supported)() {
        Ok(Fletcher2 {
            buffer_fill: 0,
            buffer: [0; FLETCHER_2_BLOCK_SIZE * FLETCHER_2_MAX_SIMD_WIDTH],
            state: Default::default(),
            order: EndianOrder::Little,
            impl_ctx,
            update_blocks: impl_ctx.update_blocks_little,
        })
    } else {
        Err(ChecksumError::Unsupported {
            checksum: ChecksumType::Fletcher2,
            implementation: implementation.to_str(),
        })
    }
}
fn finish_blocks_single_stream(state: &[u64]) -> [u64; FLETCHER_2_U64_COUNT] {
[state[0], state[1], state[2], state[3]]
}
/// Merges a dual-stream state `[a0, b0, a1, b1, c0, d0, c1, d1]` into the
/// single-stream checksum `[a, b, c, d]`.
///
/// The first-order sums simply add up. For the second-order sums, each word
/// of stream 0 would have been counted `2 * k` times in a sequential run and
/// each word of stream 1 `2 * k - 1` times, where `k` is its count within
/// its own stream; hence `c = 2 * (c0 + c1) - a1` (likewise for `d`).
/// All arithmetic wraps modulo 2^64.
fn finish_blocks_dual_stream(state: &[u64]) -> [u64; FLETCHER_2_U64_COUNT] {
    // Adds the stream-0 and stream-1 entries that sit two words apart.
    let across_streams = |index: usize| state[index].wrapping_add(state[index + 2]);

    let ra = across_streams(0);
    let rb = across_streams(1);
    let rc = across_streams(4).wrapping_mul(2).wrapping_sub(state[2]);
    let rd = across_streams(5).wrapping_mul(2).wrapping_sub(state[3]);
    [ra, rb, rc, rd]
}
/// Merges a quad-stream state into the single-stream checksum `[a, b, c, d]`.
///
/// Layout: `state[0..8]` is `[a0, b0, a1, b1, a2, b2, a3, b3]` and
/// `state[8..16]` is `[c0, d0, c1, d1, c2, d2, c3, d3]`.
///
/// The first-order sums add up. For the second-order sums, a word of stream
/// `j` would have been counted `4 * k - j` times in a sequential run, so
/// `c = 4 * (c0 + c1 + c2 + c3) - (a1 + 2 * a2 + 3 * a3)` (likewise for `d`).
/// All arithmetic wraps modulo 2^64.
fn finish_blocks_quad_stream(state: &[u64]) -> [u64; FLETCHER_2_U64_COUNT] {
    // `first` is 0 for the a/c family and 1 for the b/d family; the four
    // streams of a family sit two words apart.
    let combine = |first: usize| -> (u64, u64) {
        let mut first_order = 0u64;
        let mut second_order = 0u64;
        let mut correction = 0u64;
        for stream in 0..4usize {
            let sum = state[first + 2 * stream];
            first_order = first_order.wrapping_add(sum);
            correction = correction.wrapping_add(sum.wrapping_mul(stream as u64));
            second_order = second_order.wrapping_add(state[first + 8 + 2 * stream]);
        }
        (
            first_order,
            second_order.wrapping_mul(4).wrapping_sub(correction),
        )
    };

    let (ra, rc) = combine(0);
    let (rb, rd) = combine(1);
    [ra, rb, rc, rd]
}
/// Folds every complete 16-byte block of `data` into a single-stream state,
/// reading each 64-bit word as big-endian.
///
/// `state[0..4]` is `[a, b, c, d]`: `a`/`b` sum the first/second word of
/// each block and `c`/`d` sum the running values of `a`/`b`. All additions
/// wrap. Bytes after the last complete block are ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than four words.
fn update_blocks_generic_big(state: &mut [u64], data: &[u8]) {
    let mut sums = [state[0], state[1]];
    let mut running = [state[2], state[3]];

    for block in data.chunks_exact(FLETCHER_2_BLOCK_SIZE) {
        let lanes = sums.iter_mut().zip(running.iter_mut());
        for ((sum, total), word) in lanes.zip(block.chunks_exact(8)) {
            *sum = sum.wrapping_add(u64::from_be_bytes(word.try_into().unwrap()));
            *total = total.wrapping_add(*sum);
        }
    }

    state[..2].copy_from_slice(&sums);
    state[2..4].copy_from_slice(&running);
}
/// Folds every complete 16-byte block of `data` into a single-stream state,
/// reading each 64-bit word as little-endian.
///
/// `state[0..4]` is `[a, b, c, d]`: `a`/`b` sum the first/second word of
/// each block and `c`/`d` sum the running values of `a`/`b`. All additions
/// wrap. Bytes after the last complete block are ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than four words.
fn update_blocks_generic_little(state: &mut [u64], data: &[u8]) {
    let mut sums = [state[0], state[1]];
    let mut running = [state[2], state[3]];

    for block in data.chunks_exact(FLETCHER_2_BLOCK_SIZE) {
        let lanes = sums.iter_mut().zip(running.iter_mut());
        for ((sum, total), word) in lanes.zip(block.chunks_exact(8)) {
            *sum = sum.wrapping_add(u64::from_le_bytes(word.try_into().unwrap()));
            *total = total.wrapping_add(*sum);
        }
    }

    state[..2].copy_from_slice(&sums);
    state[2..4].copy_from_slice(&running);
}
/// Folds every complete 32-byte step of `data` into a dual-stream state,
/// reading each 64-bit word as big-endian.
///
/// `state[0..4]` is `[a0, b0, a1, b1]` (sums of the four words of a step)
/// and `state[4..8]` is `[c0, d0, c1, d1]` (sums of the running values of
/// the corresponding first-order sums). All additions wrap. Bytes after the
/// last complete step are ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than eight words.
fn update_blocks_superscalar2_big(state: &mut [u64], data: &[u8]) {
    let mut sums = [state[0], state[1], state[2], state[3]];
    let mut running = [state[4], state[5], state[6], state[7]];

    for step in data.chunks_exact(2 * FLETCHER_2_BLOCK_SIZE) {
        let lanes = sums.iter_mut().zip(running.iter_mut());
        for ((sum, total), word) in lanes.zip(step.chunks_exact(8)) {
            *sum = sum.wrapping_add(u64::from_be_bytes(word.try_into().unwrap()));
            *total = total.wrapping_add(*sum);
        }
    }

    state[..4].copy_from_slice(&sums);
    state[4..8].copy_from_slice(&running);
}
/// Folds every complete 32-byte step of `data` into a dual-stream state,
/// reading each 64-bit word as little-endian.
///
/// `state[0..4]` is `[a0, b0, a1, b1]` (sums of the four words of a step)
/// and `state[4..8]` is `[c0, d0, c1, d1]` (sums of the running values of
/// the corresponding first-order sums). All additions wrap. Bytes after the
/// last complete step are ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than eight words.
fn update_blocks_superscalar2_little(state: &mut [u64], data: &[u8]) {
    let mut sums = [state[0], state[1], state[2], state[3]];
    let mut running = [state[4], state[5], state[6], state[7]];

    for step in data.chunks_exact(2 * FLETCHER_2_BLOCK_SIZE) {
        let lanes = sums.iter_mut().zip(running.iter_mut());
        for ((sum, total), word) in lanes.zip(step.chunks_exact(8)) {
            *sum = sum.wrapping_add(u64::from_le_bytes(word.try_into().unwrap()));
            *total = total.wrapping_add(*sum);
        }
    }

    state[..4].copy_from_slice(&sums);
    state[4..8].copy_from_slice(&running);
}
/// Folds every complete 64-byte step of `data` into a quad-stream state,
/// reading each 64-bit word as big-endian.
///
/// `state[0..8]` is `[a0, b0, a1, b1, a2, b2, a3, b3]` (sums of the eight
/// words of a step) and `state[8..16]` is `[c0, d0, ..., c3, d3]` (sums of
/// the running values of the corresponding first-order sums). All additions
/// wrap. Bytes after the last complete step are ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than sixteen words.
fn update_blocks_superscalar4_big(state: &mut [u64], data: &[u8]) {
    let mut sums = [0u64; 8];
    let mut running = [0u64; 8];
    sums.copy_from_slice(&state[..8]);
    running.copy_from_slice(&state[8..16]);

    for step in data.chunks_exact(4 * FLETCHER_2_BLOCK_SIZE) {
        let lanes = sums.iter_mut().zip(running.iter_mut());
        for ((sum, total), word) in lanes.zip(step.chunks_exact(8)) {
            *sum = sum.wrapping_add(u64::from_be_bytes(word.try_into().unwrap()));
            *total = total.wrapping_add(*sum);
        }
    }

    state[..8].copy_from_slice(&sums);
    state[8..16].copy_from_slice(&running);
}
/// Folds every complete 64-byte step of `data` into a quad-stream state,
/// reading each 64-bit word as little-endian.
///
/// `state[0..8]` is `[a0, b0, a1, b1, a2, b2, a3, b3]` (sums of the eight
/// words of a step) and `state[8..16]` is `[c0, d0, ..., c3, d3]` (sums of
/// the running values of the corresponding first-order sums). All additions
/// wrap. Bytes after the last complete step are ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than sixteen words.
fn update_blocks_superscalar4_little(state: &mut [u64], data: &[u8]) {
    let mut sums = [0u64; 8];
    let mut running = [0u64; 8];
    sums.copy_from_slice(&state[..8]);
    running.copy_from_slice(&state[8..16]);

    for step in data.chunks_exact(4 * FLETCHER_2_BLOCK_SIZE) {
        let lanes = sums.iter_mut().zip(running.iter_mut());
        for ((sum, total), word) in lanes.zip(step.chunks_exact(8)) {
            *sum = sum.wrapping_add(u64::from_le_bytes(word.try_into().unwrap()));
            *total = total.wrapping_add(*sum);
        }
    }

    state[..8].copy_from_slice(&sums);
    state[8..16].copy_from_slice(&running);
}
/// SSE2 update for input whose byte order is the opposite of the target's.
///
/// Uses the single-stream layout: `state[0..2]` is `[a, b]` and
/// `state[2..4]` is `[c, d]`. Each 64-bit word is byte-swapped in scalar
/// code before being added. Bytes after the last complete 16-byte block are
/// ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than four words.
#[cfg(all(
    feature = "fletcher2-sse2",
    any(target_arch = "x86", target_arch = "x86_64")
))]
fn update_blocks_sse2_byteswap(state: &mut [u64], data: &[u8]) {
    #[target_feature(enable = "sse2")]
    unsafe fn update_blocks_sse2_byteswap_impl(state: &mut [u64], data: &[u8]) {
        // `as_mut_ptr`, not `as_ptr`: the pointer is written through below,
        // so it must be derived from the mutable borrow.
        let state = state.as_mut_ptr() as *mut arch::__m128i;
        let mut ab = arch::_mm_loadu_si128(state.add(0));
        let mut cd = arch::_mm_loadu_si128(state.add(1));

        for block in data.chunks_exact(FLETCHER_2_BLOCK_SIZE) {
            let v = u64::from_ne_bytes(block[0..8].try_into().unwrap()).swap_bytes();
            let w = u64::from_ne_bytes(block[8..16].try_into().unwrap()).swap_bytes();
            let swapped: [u64; 2] = [v, w];
            let vw = arch::_mm_loadu_si128(swapped.as_ptr() as *const _);
            ab = arch::_mm_add_epi64(ab, vw);
            cd = arch::_mm_add_epi64(cd, ab);
        }

        arch::_mm_storeu_si128(state.add(0), ab);
        arch::_mm_storeu_si128(state.add(1), cd);
    }

    // The raw 128-bit accesses above touch state[0..4]; refuse shorter slices.
    assert!(state.len() >= 4, "Fletcher2 SSE2 state needs at least 4 words");
    // SAFETY: the length check keeps both unaligned 128-bit state accesses in
    // bounds. The caller must only use this function on a CPU with SSE2;
    // `Fletcher2::new` enforces that through the context's `is_supported`.
    unsafe { update_blocks_sse2_byteswap_impl(state, data) }
}
/// SSE2 update for input already in the target's byte order.
///
/// Uses the single-stream layout: `state[0..2]` is `[a, b]` and
/// `state[2..4]` is `[c, d]`. Bytes after the last complete 16-byte block
/// are ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than four words.
#[cfg(all(
    any(feature = "fletcher2-sse2", feature = "fletcher2-ssse3"),
    any(target_arch = "x86", target_arch = "x86_64")
))]
fn update_blocks_sse2_native(state: &mut [u64], data: &[u8]) {
    #[target_feature(enable = "sse2")]
    unsafe fn update_blocks_sse2_native_impl(state: &mut [u64], data: &[u8]) {
        // `as_mut_ptr`, not `as_ptr`: the pointer is written through below,
        // so it must be derived from the mutable borrow.
        let state = state.as_mut_ptr() as *mut arch::__m128i;
        let mut ab = arch::_mm_loadu_si128(state.add(0));
        let mut cd = arch::_mm_loadu_si128(state.add(1));

        for block in data.chunks_exact(FLETCHER_2_BLOCK_SIZE) {
            let vw = arch::_mm_loadu_si128(block.as_ptr() as *const _);
            ab = arch::_mm_add_epi64(ab, vw);
            cd = arch::_mm_add_epi64(cd, ab);
        }

        arch::_mm_storeu_si128(state.add(0), ab);
        arch::_mm_storeu_si128(state.add(1), cd);
    }

    // The raw 128-bit accesses above touch state[0..4]; refuse shorter slices.
    assert!(state.len() >= 4, "Fletcher2 SSE2 state needs at least 4 words");
    // SAFETY: the length check keeps both unaligned 128-bit state accesses in
    // bounds. The caller must only use this function on a CPU with SSE2;
    // `Fletcher2::new` enforces that through the context's `is_supported`.
    unsafe { update_blocks_sse2_native_impl(state, data) }
}
/// SSSE3 update for input whose byte order is the opposite of the target's.
///
/// Uses the single-stream layout: `state[0..2]` is `[a, b]` and
/// `state[2..4]` is `[c, d]`. Each 64-bit word is byte-swapped with one
/// byte shuffle. Bytes after the last complete 16-byte block are ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than four words.
#[cfg(all(
    feature = "fletcher2-ssse3",
    any(target_arch = "x86", target_arch = "x86_64")
))]
fn update_blocks_ssse3_byteswap(state: &mut [u64], data: &[u8]) {
    #[target_feature(enable = "sse2,ssse3")]
    unsafe fn update_blocks_ssse3_byteswap_impl(state: &mut [u64], data: &[u8]) {
        // `as_mut_ptr`, not `as_ptr`: the pointer is written through below,
        // so it must be derived from the mutable borrow.
        let state = state.as_mut_ptr() as *mut arch::__m128i;
        let mut ab = arch::_mm_loadu_si128(state.add(0));
        let mut cd = arch::_mm_loadu_si128(state.add(1));

        // Arguments run from the highest byte to the lowest; the mask
        // reverses the bytes inside each 64-bit half.
        let shuffle = arch::_mm_set_epi8(
            0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
            0x06, 0x07,
        );

        for block in data.chunks_exact(FLETCHER_2_BLOCK_SIZE) {
            let vw = arch::_mm_loadu_si128(block.as_ptr() as *const _);
            let vw = arch::_mm_shuffle_epi8(vw, shuffle);
            ab = arch::_mm_add_epi64(ab, vw);
            cd = arch::_mm_add_epi64(cd, ab);
        }

        arch::_mm_storeu_si128(state.add(0), ab);
        arch::_mm_storeu_si128(state.add(1), cd);
    }

    // The raw 128-bit accesses above touch state[0..4]; refuse shorter slices.
    assert!(state.len() >= 4, "Fletcher2 SSSE3 state needs at least 4 words");
    // SAFETY: the length check keeps both unaligned 128-bit state accesses in
    // bounds. The caller must only use this function on a CPU with SSE2 and
    // SSSE3; `Fletcher2::new` enforces that through `is_supported`.
    unsafe { update_blocks_ssse3_byteswap_impl(state, data) }
}
/// AVX2 update for input whose byte order is the opposite of the target's.
///
/// Uses the dual-stream layout: `state[0..4]` is `[a0, b0, a1, b1]` and
/// `state[4..8]` is `[c0, d0, c1, d1]`. Each 64-bit word is byte-swapped
/// with one byte shuffle. Bytes after the last complete 32-byte step are
/// ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than eight words.
#[cfg(all(
    feature = "fletcher2-avx2",
    any(target_arch = "x86", target_arch = "x86_64")
))]
fn update_blocks_avx2_byteswap(state: &mut [u64], data: &[u8]) {
    #[target_feature(enable = "avx,avx2")]
    unsafe fn update_blocks_avx2_byteswap_impl(state: &mut [u64], data: &[u8]) {
        // Arguments run from the highest byte to the lowest; the same
        // 16-byte pattern is given for both 128-bit halves and reverses the
        // bytes inside each 64-bit word.
        let shuffle = arch::_mm256_set_epi8(
            0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
            0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03,
            0x04, 0x05, 0x06, 0x07,
        );

        // `as_mut_ptr`, not `as_ptr`: the pointer is written through below,
        // so it must be derived from the mutable borrow.
        let state = state.as_mut_ptr() as *mut arch::__m256i;
        let mut ab = arch::_mm256_lddqu_si256(state.add(0));
        let mut cd = arch::_mm256_lddqu_si256(state.add(1));

        for block in data.chunks_exact(2 * FLETCHER_2_BLOCK_SIZE) {
            let vwxy = arch::_mm256_lddqu_si256(block.as_ptr() as *const _);
            let vwxy = arch::_mm256_shuffle_epi8(vwxy, shuffle);
            ab = arch::_mm256_add_epi64(ab, vwxy);
            cd = arch::_mm256_add_epi64(cd, ab);
        }

        arch::_mm256_storeu_si256(state.add(0), ab);
        arch::_mm256_storeu_si256(state.add(1), cd);
    }

    // The raw 256-bit accesses above touch state[0..8]; refuse shorter slices.
    assert!(state.len() >= 8, "Fletcher2 AVX2 state needs at least 8 words");
    // SAFETY: the length check keeps both unaligned 256-bit state accesses in
    // bounds. The caller must only use this function on a CPU with AVX and
    // AVX2; `Fletcher2::new` enforces that through `is_supported`.
    unsafe { update_blocks_avx2_byteswap_impl(state, data) }
}
/// AVX2 update for input already in the target's byte order.
///
/// Uses the dual-stream layout: `state[0..4]` is `[a0, b0, a1, b1]` and
/// `state[4..8]` is `[c0, d0, c1, d1]`. Bytes after the last complete
/// 32-byte step are ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than eight words.
#[cfg(all(
    feature = "fletcher2-avx2",
    any(target_arch = "x86", target_arch = "x86_64")
))]
fn update_blocks_avx2_native(state: &mut [u64], data: &[u8]) {
    #[target_feature(enable = "avx,avx2")]
    unsafe fn update_blocks_avx2_native_impl(state: &mut [u64], data: &[u8]) {
        // `as_mut_ptr`, not `as_ptr`: the pointer is written through below,
        // so it must be derived from the mutable borrow.
        let state = state.as_mut_ptr() as *mut arch::__m256i;
        let mut ab = arch::_mm256_lddqu_si256(state.add(0));
        let mut cd = arch::_mm256_lddqu_si256(state.add(1));

        for block in data.chunks_exact(2 * FLETCHER_2_BLOCK_SIZE) {
            let vwxy = arch::_mm256_lddqu_si256(block.as_ptr() as *const _);
            ab = arch::_mm256_add_epi64(ab, vwxy);
            cd = arch::_mm256_add_epi64(cd, ab);
        }

        arch::_mm256_storeu_si256(state.add(0), ab);
        arch::_mm256_storeu_si256(state.add(1), cd);
    }

    // The raw 256-bit accesses above touch state[0..8]; refuse shorter slices.
    assert!(state.len() >= 8, "Fletcher2 AVX2 state needs at least 8 words");
    // SAFETY: the length check keeps both unaligned 256-bit state accesses in
    // bounds. The caller must only use this function on a CPU with AVX and
    // AVX2; `Fletcher2::new` enforces that through `is_supported`.
    unsafe { update_blocks_avx2_native_impl(state, data) }
}
/// AVX512F update for input whose byte order is the opposite of the target's.
///
/// Uses the quad-stream layout: `state[0..8]` holds the first-order sums
/// and `state[8..16]` the second-order sums. AVX512F alone is used, so
/// each 64-bit word is byte-swapped with masks, shifts and ORs rather than
/// a byte shuffle. Bytes after the last complete 64-byte step are ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than sixteen words.
#[cfg(all(
    any(target_arch = "x86", target_arch = "x86_64",),
    feature = "fletcher2-avx512f",
))]
fn update_blocks_avx512f_byteswap(state: &mut [u64], data: &[u8]) {
    #[target_feature(enable = "avx512f")]
    unsafe fn update_blocks_avx512f_byteswap_impl(state: &mut [u64], data: &[u8]) {
        // `as_mut_ptr`, not `as_ptr`: the pointer is written through below,
        // so it must be derived from the mutable borrow.
        let state = state.as_mut_ptr() as *mut arch::__m512i;
        // The pointer is typed as `__m512i`, so `add(1)` advances 64 bytes
        // (eight words) to the second-order sums. The load must use the same
        // offset as the store at the end of this function.
        let mut ab = arch::_mm512_loadu_si512(state.add(0));
        let mut cd = arch::_mm512_loadu_si512(state.add(1));

        // mask<i> selects byte i of every 64-bit word.
        let mask0 = arch::_mm512_maskz_set1_epi64(0xff, 0xff);
        let mask1 = arch::_mm512_slli_epi64(mask0, 8);
        let mask2 = arch::_mm512_slli_epi64(mask0, 16);
        let mask3 = arch::_mm512_slli_epi64(mask0, 24);
        let mask4 = arch::_mm512_slli_epi64(mask0, 32);
        let mask5 = arch::_mm512_slli_epi64(mask0, 40);
        let mask6 = arch::_mm512_slli_epi64(mask0, 48);
        let mask7 = arch::_mm512_slli_epi64(mask0, 56);

        for block in data.chunks_exact(4 * FLETCHER_2_BLOCK_SIZE) {
            let values = arch::_mm512_loadu_si512(block.as_ptr() as *const _);

            // Isolate each byte of every word...
            let s0 = arch::_mm512_and_epi64(values, mask0);
            let s1 = arch::_mm512_and_epi64(values, mask1);
            let s2 = arch::_mm512_and_epi64(values, mask2);
            let s3 = arch::_mm512_and_epi64(values, mask3);
            let s4 = arch::_mm512_and_epi64(values, mask4);
            let s5 = arch::_mm512_and_epi64(values, mask5);
            let s6 = arch::_mm512_and_epi64(values, mask6);
            let s7 = arch::_mm512_and_epi64(values, mask7);

            // ...move byte i to position 7 - i...
            let s0 = arch::_mm512_slli_epi64(s0, 56);
            let s1 = arch::_mm512_slli_epi64(s1, 40);
            let s2 = arch::_mm512_slli_epi64(s2, 24);
            let s3 = arch::_mm512_slli_epi64(s3, 8);
            let s4 = arch::_mm512_srli_epi64(s4, 8);
            let s5 = arch::_mm512_srli_epi64(s5, 24);
            let s6 = arch::_mm512_srli_epi64(s6, 40);
            let s7 = arch::_mm512_srli_epi64(s7, 56);

            // ...and merge them back into byte-swapped words.
            let s01 = arch::_mm512_or_epi64(s0, s1);
            let s23 = arch::_mm512_or_epi64(s2, s3);
            let s45 = arch::_mm512_or_epi64(s4, s5);
            let s67 = arch::_mm512_or_epi64(s6, s7);
            let s03 = arch::_mm512_or_epi64(s01, s23);
            let s47 = arch::_mm512_or_epi64(s45, s67);
            let values = arch::_mm512_or_epi64(s03, s47);

            ab = arch::_mm512_add_epi64(ab, values);
            cd = arch::_mm512_add_epi64(cd, ab);
        }

        arch::_mm512_storeu_si512(state.add(0), ab);
        arch::_mm512_storeu_si512(state.add(1), cd);
    }

    // The raw 512-bit accesses above touch state[0..16]; refuse shorter slices.
    assert!(
        state.len() >= 16,
        "Fletcher2 AVX512 state needs at least 16 words"
    );
    // SAFETY: the length check keeps both unaligned 512-bit state accesses in
    // bounds. The caller must only use this function on a CPU with AVX512F;
    // `Fletcher2::new` enforces that through the context's `is_supported`.
    unsafe { update_blocks_avx512f_byteswap_impl(state, data) }
}
/// AVX512F update for input already in the target's byte order.
///
/// Uses the quad-stream layout: `state[0..8]` holds the first-order sums
/// and `state[8..16]` the second-order sums. Bytes after the last complete
/// 64-byte step are ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than sixteen words.
#[cfg(all(
    any(target_arch = "x86", target_arch = "x86_64",),
    any(feature = "fletcher2-avx512f", feature = "fletcher2-avx512bw"),
))]
fn update_blocks_avx512f_native(state: &mut [u64], data: &[u8]) {
    #[target_feature(enable = "avx512f")]
    unsafe fn update_blocks_avx512f_native_impl(state: &mut [u64], data: &[u8]) {
        // `as_mut_ptr`, not `as_ptr`: the pointer is written through below,
        // so it must be derived from the mutable borrow.
        let state = state.as_mut_ptr() as *mut arch::__m512i;
        // The pointer is typed as `__m512i`, so `add(1)` advances 64 bytes
        // (eight words) to the second-order sums. The load must use the same
        // offset as the store at the end of this function.
        let mut ab = arch::_mm512_loadu_si512(state.add(0));
        let mut cd = arch::_mm512_loadu_si512(state.add(1));

        for block in data.chunks_exact(4 * FLETCHER_2_BLOCK_SIZE) {
            let values = arch::_mm512_loadu_si512(block.as_ptr() as *const _);
            ab = arch::_mm512_add_epi64(ab, values);
            cd = arch::_mm512_add_epi64(cd, ab);
        }

        arch::_mm512_storeu_si512(state.add(0), ab);
        arch::_mm512_storeu_si512(state.add(1), cd);
    }

    // The raw 512-bit accesses above touch state[0..16]; refuse shorter slices.
    assert!(
        state.len() >= 16,
        "Fletcher2 AVX512 state needs at least 16 words"
    );
    // SAFETY: the length check keeps both unaligned 512-bit state accesses in
    // bounds. The caller must only use this function on a CPU with AVX512F;
    // `Fletcher2::new` enforces that through the context's `is_supported`.
    unsafe { update_blocks_avx512f_native_impl(state, data) }
}
/// AVX512BW update for input whose byte order is the opposite of the target's.
///
/// Uses the quad-stream layout: `state[0..8]` holds the first-order sums
/// and `state[8..16]` the second-order sums. Each 64-bit word is
/// byte-swapped with one byte shuffle. Bytes after the last complete
/// 64-byte step are ignored.
///
/// # Panics
///
/// Panics if `state` holds fewer than sixteen words.
#[cfg(all(
    any(target_arch = "x86", target_arch = "x86_64",),
    feature = "fletcher2-avx512bw",
))]
fn update_blocks_avx512bw_byteswap(state: &mut [u64], data: &[u8]) {
    #[target_feature(enable = "avx512f,avx512bw")]
    unsafe fn update_blocks_avx512bw_byteswap_impl(state: &mut [u64], data: &[u8]) {
        // Arguments run from the highest byte to the lowest.
        // NOTE(review): presumably only the low four bits of each index
        // select a byte within its 128-bit lane, making this a per-word byte
        // reversal; confirm against the `_mm512_shuffle_epi8` documentation.
        let shuffle = arch::_mm512_set_epi8(
            0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
            0x16, 0x17, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03,
            0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x10, 0x11,
            0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        );

        // `as_mut_ptr`, not `as_ptr`: the pointer is written through below,
        // so it must be derived from the mutable borrow.
        let state = state.as_mut_ptr() as *mut arch::__m512i;
        // The pointer is typed as `__m512i`, so `add(1)` advances 64 bytes
        // (eight words) to the second-order sums. The load must use the same
        // offset as the store at the end of this function.
        let mut ab = arch::_mm512_loadu_si512(state.add(0));
        let mut cd = arch::_mm512_loadu_si512(state.add(1));

        for block in data.chunks_exact(4 * FLETCHER_2_BLOCK_SIZE) {
            let values = arch::_mm512_loadu_si512(block.as_ptr() as *const _);
            let values = arch::_mm512_shuffle_epi8(values, shuffle);
            ab = arch::_mm512_add_epi64(ab, values);
            cd = arch::_mm512_add_epi64(cd, ab);
        }

        arch::_mm512_storeu_si512(state.add(0), ab);
        arch::_mm512_storeu_si512(state.add(1), cd);
    }

    // The raw 512-bit accesses above touch state[0..16]; refuse shorter slices.
    assert!(
        state.len() >= 16,
        "Fletcher2 AVX512 state needs at least 16 words"
    );
    // SAFETY: the length check keeps both unaligned 512-bit state accesses in
    // bounds. The caller must only use this function on a CPU with AVX512F
    // and AVX512BW; `Fletcher2::new` enforces that through `is_supported`.
    unsafe { update_blocks_avx512bw_byteswap_impl(state, data) }
}
}
impl Checksum for Fletcher2 {
fn reset(&mut self, order: EndianOrder) -> Result<(), ChecksumError> {
self.buffer_fill = 0;
self.buffer = [0; FLETCHER_2_BLOCK_SIZE * FLETCHER_2_MAX_SIMD_WIDTH];
self.state = Default::default();
self.order = order;
self.update_blocks = match self.order {
EndianOrder::Big => self.impl_ctx.update_blocks_big,
EndianOrder::Little => self.impl_ctx.update_blocks_little,
};
Ok(())
}
fn update(&mut self, data: &[u8]) -> Result<(), ChecksumError> {
let mut data = data;
if self.buffer_fill > 0 {
let todo = cmp::min(self.impl_ctx.block_size - self.buffer_fill, data.len());
self.buffer[self.buffer_fill..self.buffer_fill + todo].copy_from_slice(&data[0..todo]);
self.buffer_fill += todo;
data = &data[todo..];
if self.buffer_fill == self.impl_ctx.block_size {
let full_blocks_data = &self.buffer[0..self.buffer_fill];
(self.update_blocks)(&mut self.state, full_blocks_data);
self.buffer_fill = 0;
}
}
let remainder = data.len() % self.impl_ctx.block_size;
let full_blocks_data = &data[0..data.len() - remainder];
(self.update_blocks)(&mut self.state, full_blocks_data);
if remainder > 0 {
self.buffer[0..remainder].copy_from_slice(&data[data.len() - remainder..]);
self.buffer_fill = remainder;
}
Ok(())
}
fn finalize(&mut self) -> Result<[u64; 4], ChecksumError> {
let mut result = (self.impl_ctx.finish_blocks)(&self.state);
let remainder = self.buffer_fill % FLETCHER_2_BLOCK_SIZE;
let full_block_bytes = self.buffer_fill - remainder;
if full_block_bytes > 0 {
let generic = match self.order {
EndianOrder::Big => Fletcher2::update_blocks_generic_big,
EndianOrder::Little => Fletcher2::update_blocks_generic_little,
};
(generic)(&mut result, &self.buffer[0..full_block_bytes]);
result = Fletcher2::finish_blocks_single_stream(&result);
}
Ok(result)
}
fn hash(&mut self, data: &[u8], order: EndianOrder) -> Result<[u64; 4], ChecksumError> {
self.reset(order)?;
self.update(data)?;
self.finalize()
}
}