use archmage::prelude::*;
/// Modulus applied to both 16-bit halves of an Adler-32 checksum.
/// 65521 is the largest prime below 2^16.
const DIVISOR: u32 = 65521;
/// Maximum number of bytes that may be accumulated between two `% DIVISOR` reductions.
///
/// With both halves starting below `DIVISOR` and every byte equal to 0xFF, the `s2`
/// accumulator stays below 2^32 for 5552 bytes and would overflow on the 5553rd, so every
/// implementation in this file caps its chunk length at (a rounded-down multiple of) this value.
const MAX_CHUNK_LEN: usize = 5552;
/// Updates the Adler-32 checksum `adler` with the bytes in `data` and returns the result.
///
/// `adler` holds `s1` in its low 16 bits and `s2` in its high 16 bits. The tests in this file
/// start a fresh checksum with `adler32(1, ..)` and continue a running one by passing the
/// previous return value.
///
/// The work is delegated to one of the `adler32_impl_*` functions through `incant!`. The tier
/// list includes the AVX-512 variants (`v4x`, `v4`) only when the `avx512` feature is enabled.
///
/// NOTE(review): `adler32_impl_scalar` is not named in either tier list; presumably `incant!`
/// appends the scalar fallback implicitly and picks the best tier at runtime — confirm against
/// the archmage documentation.
/// NOTE(review): `#[allow(unexpected_cfgs)]` presumably silences the lint for the `avx512`
/// feature cfg — confirm the feature is declared in Cargo.toml.
#[must_use]
#[allow(unexpected_cfgs)]
pub fn adler32(adler: u32, data: &[u8]) -> u32 {
#[cfg(feature = "avx512")]
{
incant!(adler32_impl(adler, data), [v4x, v4, v3, neon, wasm128])
}
#[cfg(not(feature = "avx512"))]
{
incant!(adler32_impl(adler, data), [v3, neon, wasm128])
}
}
/// Merges the checksums of two adjacent byte ranges into the checksum of their concatenation.
///
/// * `adler1` - checksum of the first range.
/// * `adler2` - checksum of the second range, computed from the initial value `1`.
/// * `len2` - length in bytes of the second range; only `len2 % DIVISOR` influences the result.
///
/// Returns the combined checksum with `s2` in the high and `s1` in the low 16 bits.
#[must_use]
pub fn adler32_combine(adler1: u32, adler2: u32, len2: usize) -> u32 {
    let (head_lo, head_hi) = (adler1 & 0xFFFF, adler1 >> 16);
    let (tail_lo, tail_hi) = (adler2 & 0xFFFF, adler2 >> 16);

    // The second checksum started from s1 = 1, so that seed is removed again;
    // adding DIVISOR first keeps the intermediate value non-negative.
    let low = (head_lo + tail_lo + DIVISOR - 1) % DIVISOR;

    // Every byte of the second range additionally sees the first range's s1,
    // while the seed's contribution (one per byte) has to be taken back out.
    let len_mod = (len2 % DIVISOR as usize) as u32;
    let high = (head_hi + tail_hi + len_mod * head_lo + DIVISOR * 2 - len_mod) % DIVISOR;

    (high << 16) | low
}
/// Adler-32 update for the `v4x` tier, built on `_mm512_dpbusd_epi32` (AVX-512 VNNI) dot products.
///
/// Splits `adler` into `s1` (low 16 bits) and `s2` (high 16 bits), walks `data` in chunks of at
/// most `MAX_SIMD_CHUNK` bytes and returns `(s2 << 16) | s1`. Inside a chunk every full
/// 256-byte group goes through the vector loop; whatever is left is handed to
/// `adler32_chunk_scalar`, which also applies the `% DIVISOR` reduction for the chunk.
///
/// NOTE(review): `#[arcane]` and `X64V4xToken` come from `archmage`; presumably the token proves
/// the required CPU features are present so the intrinsics can be called safely — confirm
/// against the archmage documentation.
#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
#[arcane]
#[allow(clippy::incompatible_msrv)]
fn adler32_impl_v4x(_token: X64V4xToken, adler: u32, data: &[u8]) -> u32 {
// Bytes per 512-bit vector.
const VL: usize = 64;
// Chunk cap rounded down to a whole number of 4 * VL (256-byte) loop iterations: 5376 bytes.
const MAX_SIMD_CHUNK: usize = MAX_CHUNK_LEN & !(4 * VL - 1);
#[repr(align(64))]
struct Aligned64([i8; 64]);
// Positional weights inside one vector: the first byte contributes to s2 64 times,
// the last byte once.
static MULTS: Aligned64 = Aligned64([
64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19,
18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
]);
let mults = _mm512_loadu_si512(&MULTS.0);
// A dot product against all-ones yields plain byte sums (the s1 contribution).
let ones = _mm512_set1_epi8(1);
let zeroes = _mm512_setzero_si512();
let mut s1 = adler & 0xFFFF;
let mut s2 = adler >> 16;
let mut remaining = data;
while !remaining.is_empty() {
// Peel off the next chunk; s1/s2 are reduced modulo DIVISOR once per chunk.
let n = remaining.len().min(MAX_SIMD_CHUNK);
let (chunk, rest) = remaining.split_at(n);
remaining = rest;
let mut p = chunk;
if p.len() >= 4 * VL {
// Four independent accumulator sets (a..d), one per vector of the 256-byte group.
// v_s1_*      : running byte sums.
// v_s2_*      : running position-weighted byte sums within each vector.
// v_s1_sums_* : sum of the v_s1_* values seen at the start of every iteration.
let mut v_s1_a = zeroes;
let mut v_s1_b = zeroes;
let mut v_s1_c = zeroes;
let mut v_s1_d = zeroes;
let mut v_s2_a = zeroes;
let mut v_s2_b = zeroes;
let mut v_s2_c = zeroes;
let mut v_s2_d = zeroes;
let mut v_s1_sums_a = zeroes;
let mut v_s1_sums_b = zeroes;
let mut v_s1_sums_c = zeroes;
let mut v_s1_sums_d = zeroes;
// Each vectorized byte adds the incoming s1 to s2 exactly once; account for that up front.
let vectorized_len = p.len() & !(4 * VL - 1);
s2 += s1 * vectorized_len as u32;
while p.len() >= 4 * VL {
let data_a: &[u8; 64] = p[..64].try_into().unwrap();
let data_b: &[u8; 64] = p[64..128].try_into().unwrap();
let data_c: &[u8; 64] = p[128..192].try_into().unwrap();
let data_d: &[u8; 64] = p[192..256].try_into().unwrap();
let va = _mm512_loadu_si512(data_a);
let vb = _mm512_loadu_si512(data_b);
let vc = _mm512_loadu_si512(data_c);
let vd = _mm512_loadu_si512(data_d);
// Must happen before v_s1_* is updated: records the byte sums of all earlier iterations.
v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_a);
v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1_b);
v_s1_sums_c = _mm512_add_epi32(v_s1_sums_c, v_s1_c);
v_s1_sums_d = _mm512_add_epi32(v_s1_sums_d, v_s1_d);
// Unsigned data bytes times signed weights, four products summed into each 32-bit lane.
v_s2_a = _mm512_dpbusd_epi32(v_s2_a, va, mults);
v_s2_b = _mm512_dpbusd_epi32(v_s2_b, vb, mults);
v_s2_c = _mm512_dpbusd_epi32(v_s2_c, vc, mults);
v_s2_d = _mm512_dpbusd_epi32(v_s2_d, vd, mults);
v_s1_a = _mm512_dpbusd_epi32(v_s1_a, va, ones);
v_s1_b = _mm512_dpbusd_epi32(v_s1_b, vb, ones);
v_s1_c = _mm512_dpbusd_epi32(v_s1_c, vc, ones);
v_s1_d = _mm512_dpbusd_epi32(v_s1_d, vd, ones);
p = &p[4 * VL..];
}
// Fold the four accumulator sets together:
//   v_s1 = a + b + c + d
//   v_s2 = sum(v_s2_*) + 4*VL * sum(v_s1_sums_*) + 3*VL*a + 2*VL*b + VL*c
// where 3*VL*a + 2*VL*b + VL*c is computed as 2*VL*(a + b) + VL*(a + c).
let tmp0 = _mm512_add_epi32(v_s1_a, v_s1_b);
let tmp1 = _mm512_add_epi32(v_s1_a, v_s1_c);
let total_s1_sums = _mm512_add_epi32(
_mm512_add_epi32(v_s1_sums_a, v_s1_sums_b),
_mm512_add_epi32(v_s1_sums_c, v_s1_sums_d),
);
let v_s1 = _mm512_add_epi32(_mm512_add_epi32(tmp0, v_s1_c), v_s1_d);
let v_s2 = {
// Shifts by 8, 7 and 6 multiply by 4*VL = 256, 2*VL = 128 and VL = 64 respectively.
let cross_iter = _mm512_slli_epi32(total_s1_sums, 8); let pos_2vl = _mm512_slli_epi32(tmp0, 7); let pos_vl = _mm512_slli_epi32(tmp1, 6); let sum_s2 = _mm512_add_epi32(
_mm512_add_epi32(v_s2_a, v_s2_b),
_mm512_add_epi32(v_s2_c, v_s2_d),
);
_mm512_add_epi32(
_mm512_add_epi32(cross_iter, sum_s2),
_mm512_add_epi32(pos_2vl, pos_vl),
)
};
// Horizontal reduction: 512 -> 256 -> 128 bits, then across the four 32-bit lanes.
let v_s1_256 = _mm256_add_epi32(
_mm512_extracti64x4_epi64(v_s1, 0),
_mm512_extracti64x4_epi64(v_s1, 1),
);
let v_s2_256 = _mm256_add_epi32(
_mm512_extracti64x4_epi64(v_s2, 0),
_mm512_extracti64x4_epi64(v_s2, 1),
);
let mut s1_128 = _mm_add_epi32(
_mm256_castsi256_si128(v_s1_256),
_mm256_extracti128_si256(v_s1_256, 1),
);
let mut s2_128 = _mm_add_epi32(
_mm256_castsi256_si128(v_s2_256),
_mm256_extracti128_si256(v_s2_256, 1),
);
// 0x31 adds lane 1 into lane 0 and lane 3 into lane 2; 0x02 then adds lane 2 into lane 0.
// Unlike the SAD-based tiers, all four s1 lanes carry data here, so s1 needs both steps.
s1_128 = _mm_add_epi32(s1_128, _mm_shuffle_epi32(s1_128, 0x31));
s2_128 = _mm_add_epi32(s2_128, _mm_shuffle_epi32(s2_128, 0x31));
s1_128 = _mm_add_epi32(s1_128, _mm_shuffle_epi32(s1_128, 0x02));
s2_128 = _mm_add_epi32(s2_128, _mm_shuffle_epi32(s2_128, 0x02));
s1 += _mm_cvtsi128_si32(s1_128) as u32;
s2 += _mm_cvtsi128_si32(s2_128) as u32;
}
// Handles the sub-256-byte tail (possibly empty) and reduces s1/s2 modulo DIVISOR.
adler32_chunk_scalar(&mut s1, &mut s2, p);
}
(s2 << 16) | s1
}
/// Adler-32 update for the `v4` tier: 512-bit vectors, two per loop iteration (128 bytes).
///
/// Splits `adler` into `s1` (low 16 bits) and `s2` (high 16 bits), walks `data` in chunks of at
/// most `MAX_SIMD_CHUNK` bytes and returns `(s2 << 16) | s1`. Byte sums come from
/// `_mm512_sad_epu8`; the position-weighted part of `s2` is recovered after the loop from
/// per-position 16-bit byte sums multiplied by the `MULTS_*` tables. The tail of each chunk
/// and the `% DIVISOR` reduction are handled by `adler32_chunk_scalar`.
///
/// NOTE(review): `#[arcane]` and `X64V4Token` come from `archmage`; presumably the token proves
/// the required CPU features are present — confirm against the archmage documentation.
#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
#[arcane]
#[allow(clippy::incompatible_msrv)]
fn adler32_impl_v4(_token: X64V4Token, adler: u32, data: &[u8]) -> u32 {
// Bytes per 512-bit vector.
const VL: usize = 64;
// Chunk cap: the smaller of MAX_CHUNK_LEN and the i16 limit, rounded down to a multiple of
// the 2 * VL loop stride. The i16 limit exists because every 16-bit lane of v_byte_sums_*
// grows by at most 255 per iteration, so 128 iterations (32640) stay within i16::MAX.
// Here the result is 5504 bytes.
const MAX_SIMD_CHUNK: usize = {
let limit = 2 * VL * (i16::MAX as usize / u8::MAX as usize);
let m = if limit < MAX_CHUNK_LEN {
limit
} else {
MAX_CHUNK_LEN
};
m & !(2 * VL - 1)
};
#[repr(align(64))]
struct Aligned([i16; 32]);
// Weights 128..=1 for the 128 bytes of one iteration, permuted to match the unpack order:
// `unpacklo`/`unpackhi` work within each 128-bit lane, so A and C cover the low 8 bytes of
// every 16-byte group (first and second vector) and B and D cover the high 8 bytes.
static MULTS_A: Aligned = Aligned([
128, 127, 126, 125, 124, 123, 122, 121, 112, 111, 110, 109, 108, 107, 106, 105, 96, 95, 94,
93, 92, 91, 90, 89, 80, 79, 78, 77, 76, 75, 74, 73,
]);
static MULTS_B: Aligned = Aligned([
120, 119, 118, 117, 116, 115, 114, 113, 104, 103, 102, 101, 100, 99, 98, 97, 88, 87, 86,
85, 84, 83, 82, 81, 72, 71, 70, 69, 68, 67, 66, 65,
]);
static MULTS_C: Aligned = Aligned([
64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41, 32, 31, 30, 29, 28, 27, 26,
25, 16, 15, 14, 13, 12, 11, 10, 9,
]);
static MULTS_D: Aligned = Aligned([
56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33, 24, 23, 22, 21, 20, 19, 18,
17, 8, 7, 6, 5, 4, 3, 2, 1,
]);
let mults_a = _mm512_loadu_si512(&MULTS_A.0);
let mults_b = _mm512_loadu_si512(&MULTS_B.0);
let mults_c = _mm512_loadu_si512(&MULTS_C.0);
let mults_d = _mm512_loadu_si512(&MULTS_D.0);
let zeroes = _mm512_setzero_si512();
let mut s1 = adler & 0xFFFF;
let mut s2 = adler >> 16;
let mut remaining = data;
while !remaining.is_empty() {
// Peel off the next chunk; s1/s2 are reduced modulo DIVISOR once per chunk.
let n = remaining.len().min(MAX_SIMD_CHUNK);
let (chunk, rest) = remaining.split_at(n);
remaining = rest;
let mut p = chunk;
if p.len() >= 2 * VL {
// v_s1          : running byte sums.
// v_s1_sums     : sum of the v_s1 values seen at the start of every iteration.
// v_byte_sums_* : per-position byte sums, widened to 16 bits.
let mut v_s1 = zeroes;
let mut v_s1_sums = zeroes;
let mut v_byte_sums_a = zeroes;
let mut v_byte_sums_b = zeroes;
let mut v_byte_sums_c = zeroes;
let mut v_byte_sums_d = zeroes;
// Each vectorized byte adds the incoming s1 to s2 exactly once; account for that up front.
let vectorized_len = p.len() & !(2 * VL - 1);
s2 += s1 * vectorized_len as u32;
while p.len() >= 2 * VL {
let data_a: &[u8; 64] = p[..64].try_into().unwrap();
let data_b: &[u8; 64] = p[64..128].try_into().unwrap();
let va = _mm512_loadu_si512(data_a);
let vb = _mm512_loadu_si512(data_b);
// Must happen before v_s1 is updated: records the byte sums of all earlier iterations.
v_s1_sums = _mm512_add_epi32(v_s1_sums, v_s1);
// Interleaving with zero widens the bytes to 16-bit lanes.
v_byte_sums_a = _mm512_add_epi16(v_byte_sums_a, _mm512_unpacklo_epi8(va, zeroes));
v_byte_sums_b = _mm512_add_epi16(v_byte_sums_b, _mm512_unpackhi_epi8(va, zeroes));
v_byte_sums_c = _mm512_add_epi16(v_byte_sums_c, _mm512_unpacklo_epi8(vb, zeroes));
v_byte_sums_d = _mm512_add_epi16(v_byte_sums_d, _mm512_unpackhi_epi8(vb, zeroes));
// SAD against zero sums each group of 8 bytes into one 64-bit lane.
let sad_a = _mm512_sad_epu8(va, zeroes);
let sad_b = _mm512_sad_epu8(vb, zeroes);
v_s1 = _mm512_add_epi32(v_s1, _mm512_add_epi32(sad_a, sad_b));
p = &p[2 * VL..];
}
let v_s2 = {
// Earlier iterations' byte sums count 2 * VL = 128 times each (shift by 7); the
// per-position sums are weighted by the MULTS tables via 16-bit multiply-add.
let weighted_sums = _mm512_slli_epi32(v_s1_sums, 7); let ma = _mm512_madd_epi16(v_byte_sums_a, mults_a);
let mb = _mm512_madd_epi16(v_byte_sums_b, mults_b);
let mc = _mm512_madd_epi16(v_byte_sums_c, mults_c);
let md = _mm512_madd_epi16(v_byte_sums_d, mults_d);
let sum_ab = _mm512_add_epi32(ma, mb);
let sum_cd = _mm512_add_epi32(mc, md);
_mm512_add_epi32(weighted_sums, _mm512_add_epi32(sum_ab, sum_cd))
};
// Horizontal reduction: 512 -> 256 -> 128 bits, then across the 32-bit lanes.
let v_s1_256 = _mm256_add_epi32(
_mm512_extracti64x4_epi64(v_s1, 0),
_mm512_extracti64x4_epi64(v_s1, 1),
);
let v_s2_256 = _mm256_add_epi32(
_mm512_extracti64x4_epi64(v_s2, 0),
_mm512_extracti64x4_epi64(v_s2, 1),
);
let mut s1_128 = _mm_add_epi32(
_mm256_castsi256_si128(v_s1_256),
_mm256_extracti128_si256(v_s1_256, 1),
);
let mut s2_128 = _mm_add_epi32(
_mm256_castsi256_si128(v_s2_256),
_mm256_extracti128_si256(v_s2_256, 1),
);
// s1 only has data in lanes 0 and 2 (SAD writes one sum per 64-bit lane), so it skips the
// 0x31 step (lane 1 -> 0, lane 3 -> 2) and only needs 0x02 (lane 2 -> 0).
s2_128 = _mm_add_epi32(s2_128, _mm_shuffle_epi32(s2_128, 0x31));
s1_128 = _mm_add_epi32(s1_128, _mm_shuffle_epi32(s1_128, 0x02));
s2_128 = _mm_add_epi32(s2_128, _mm_shuffle_epi32(s2_128, 0x02));
s1 += _mm_cvtsi128_si32(s1_128) as u32;
s2 += _mm_cvtsi128_si32(s2_128) as u32;
}
// Handles the sub-128-byte tail (possibly empty) and reduces s1/s2 modulo DIVISOR.
adler32_chunk_scalar(&mut s1, &mut s2, p);
}
(s2 << 16) | s1
}
/// Adler-32 update for the `v3` tier: 256-bit vectors, two per loop iteration (64 bytes).
///
/// Splits `adler` into `s1` (low 16 bits) and `s2` (high 16 bits), walks `data` in chunks of at
/// most `MAX_SIMD_CHUNK` bytes and returns `(s2 << 16) | s1`. Byte sums come from
/// `_mm256_sad_epu8`; the position-weighted part of `s2` is recovered after the loop from
/// per-position 16-bit byte sums multiplied by the `MULTS_*` tables. The tail of each chunk
/// and the `% DIVISOR` reduction are handled by `adler32_chunk_scalar`.
///
/// NOTE(review): `#[arcane]` and `Desktop64` come from `archmage`; presumably the token proves
/// the required CPU features are present — confirm against the archmage documentation.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn adler32_impl_v3(_token: Desktop64, adler: u32, data: &[u8]) -> u32 {
// Bytes per 256-bit vector.
const VL: usize = 32;
// Chunk cap: the smaller of MAX_CHUNK_LEN and the i16 limit, rounded down to a multiple of
// the 2 * VL loop stride. The i16 limit exists because every 16-bit lane of v_byte_sums_*
// grows by at most 255 per iteration, so 128 iterations (32640) stay within i16::MAX.
// Here the result is 5504 bytes.
const MAX_SIMD_CHUNK: usize = {
let limit = 2 * VL * (i16::MAX as usize / u8::MAX as usize);
let m = if limit < MAX_CHUNK_LEN {
limit
} else {
MAX_CHUNK_LEN
};
m & !(2 * VL - 1)
};
#[repr(align(32))]
struct Aligned([i16; 16]);
// Weights 64..=1 for the 64 bytes of one iteration, permuted to match the unpack order:
// `unpacklo`/`unpackhi` work within each 128-bit lane, so A and C cover the low 8 bytes of
// every 16-byte group (first and second vector) and B and D cover the high 8 bytes.
static MULTS_A: Aligned = Aligned([
64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41,
]);
static MULTS_B: Aligned = Aligned([
56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33,
]);
static MULTS_C: Aligned = Aligned([
32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9,
]);
static MULTS_D: Aligned = Aligned([24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1]);
let mults_a = _mm256_loadu_si256(&MULTS_A.0);
let mults_b = _mm256_loadu_si256(&MULTS_B.0);
let mults_c = _mm256_loadu_si256(&MULTS_C.0);
let mults_d = _mm256_loadu_si256(&MULTS_D.0);
let zeroes = _mm256_setzero_si256();
let mut s1 = adler & 0xFFFF;
let mut s2 = adler >> 16;
let mut remaining = data;
while !remaining.is_empty() {
// Peel off the next chunk; s1/s2 are reduced modulo DIVISOR once per chunk.
let n = remaining.len().min(MAX_SIMD_CHUNK);
let (chunk, rest) = remaining.split_at(n);
remaining = rest;
let mut p = chunk;
if p.len() >= 2 * VL {
// v_s1          : running byte sums.
// v_s1_sums     : sum of the v_s1 values seen at the start of every iteration.
// v_byte_sums_* : per-position byte sums, widened to 16 bits.
let mut v_s1 = zeroes;
let mut v_s1_sums = zeroes;
let mut v_byte_sums_a = zeroes;
let mut v_byte_sums_b = zeroes;
let mut v_byte_sums_c = zeroes;
let mut v_byte_sums_d = zeroes;
// Each vectorized byte adds the incoming s1 to s2 exactly once; account for that up front.
let vectorized_len = p.len() & !(2 * VL - 1);
s2 += s1 * vectorized_len as u32;
while p.len() >= 2 * VL {
let data_a: &[u8; 32] = p[..32].try_into().unwrap();
let data_b: &[u8; 32] = p[32..64].try_into().unwrap();
let va = _mm256_loadu_si256(data_a);
let vb = _mm256_loadu_si256(data_b);
// Must happen before v_s1 is updated: records the byte sums of all earlier iterations.
v_s1_sums = _mm256_add_epi32(v_s1_sums, v_s1);
// Interleaving with zero widens the bytes to 16-bit lanes.
v_byte_sums_a = _mm256_add_epi16(v_byte_sums_a, _mm256_unpacklo_epi8(va, zeroes));
v_byte_sums_b = _mm256_add_epi16(v_byte_sums_b, _mm256_unpackhi_epi8(va, zeroes));
v_byte_sums_c = _mm256_add_epi16(v_byte_sums_c, _mm256_unpacklo_epi8(vb, zeroes));
v_byte_sums_d = _mm256_add_epi16(v_byte_sums_d, _mm256_unpackhi_epi8(vb, zeroes));
// SAD against zero sums each group of 8 bytes into one 64-bit lane.
let sad_a = _mm256_sad_epu8(va, zeroes);
let sad_b = _mm256_sad_epu8(vb, zeroes);
v_s1 = _mm256_add_epi32(v_s1, _mm256_add_epi32(sad_a, sad_b));
p = &p[2 * VL..];
}
let v_s2 = {
// Earlier iterations' byte sums count 2 * VL = 64 times each (shift by 6); the
// per-position sums are weighted by the MULTS tables via 16-bit multiply-add.
let weighted_sums = _mm256_slli_epi32(v_s1_sums, 6); let ma = _mm256_madd_epi16(v_byte_sums_a, mults_a);
let mb = _mm256_madd_epi16(v_byte_sums_b, mults_b);
let mc = _mm256_madd_epi16(v_byte_sums_c, mults_c);
let md = _mm256_madd_epi16(v_byte_sums_d, mults_d);
let sum_ab = _mm256_add_epi32(ma, mb);
let sum_cd = _mm256_add_epi32(mc, md);
_mm256_add_epi32(weighted_sums, _mm256_add_epi32(sum_ab, sum_cd))
};
// Horizontal reduction: 256 -> 128 bits, then across the 32-bit lanes.
let s1_lo = _mm256_castsi256_si128(v_s1);
let s1_hi = _mm256_extracti128_si256(v_s1, 1);
let mut s1_128 = _mm_add_epi32(s1_lo, s1_hi);
let s2_lo = _mm256_castsi256_si128(v_s2);
let s2_hi = _mm256_extracti128_si256(v_s2, 1);
let mut s2_128 = _mm_add_epi32(s2_lo, s2_hi);
// s1 only has data in lanes 0 and 2 (SAD writes one sum per 64-bit lane), so it skips the
// 0x31 step (lane 1 -> 0, lane 3 -> 2) and only needs 0x02 (lane 2 -> 0).
s2_128 = _mm_add_epi32(s2_128, _mm_shuffle_epi32(s2_128, 0x31));
s1_128 = _mm_add_epi32(s1_128, _mm_shuffle_epi32(s1_128, 0x02));
s2_128 = _mm_add_epi32(s2_128, _mm_shuffle_epi32(s2_128, 0x02));
s1 += _mm_cvtsi128_si32(s1_128) as u32;
s2 += _mm_cvtsi128_si32(s2_128) as u32;
}
// Handles the sub-64-byte tail (possibly empty) and reduces s1/s2 modulo DIVISOR.
adler32_chunk_scalar(&mut s1, &mut s2, p);
}
(s2 << 16) | s1
}
/// Adler-32 update for the NEON tier: four 128-bit vectors (64 bytes) per loop iteration.
///
/// Splits `adler` into `s1` (low 16 bits) and `s2` (high 16 bits), walks `data` in chunks of at
/// most `MAX_CHUNK_LEN & !63` (5504) bytes and returns `(s2 << 16) | s1`. Byte sums are built
/// with pairwise widening adds; the position-weighted part of `s2` is recovered after the loop
/// from per-position 16-bit byte sums multiplied by the `MULTS_*` tables. The tail of each
/// chunk and the `% DIVISOR` reduction are handled by `adler32_chunk_scalar`.
///
/// NOTE(review): `#[arcane]` and `NeonToken` come from `archmage`; presumably the token proves
/// NEON is available — confirm against the archmage documentation.
#[cfg(target_arch = "aarch64")]
#[arcane]
fn adler32_impl_neon(_token: NeonToken, adler: u32, data: &[u8]) -> u32 {
// Weights 64..=1 for the 64 bytes of one iteration, eight positions per table, in byte order.
static MULTS_A: [u16; 8] = [64, 63, 62, 61, 60, 59, 58, 57];
static MULTS_B: [u16; 8] = [56, 55, 54, 53, 52, 51, 50, 49];
static MULTS_C: [u16; 8] = [48, 47, 46, 45, 44, 43, 42, 41];
static MULTS_D: [u16; 8] = [40, 39, 38, 37, 36, 35, 34, 33];
static MULTS_E: [u16; 8] = [32, 31, 30, 29, 28, 27, 26, 25];
static MULTS_F: [u16; 8] = [24, 23, 22, 21, 20, 19, 18, 17];
static MULTS_G: [u16; 8] = [16, 15, 14, 13, 12, 11, 10, 9];
static MULTS_H: [u16; 8] = [8, 7, 6, 5, 4, 3, 2, 1];
let mults_a = vld1q_u16(&MULTS_A);
let mults_b = vld1q_u16(&MULTS_B);
let mults_c = vld1q_u16(&MULTS_C);
let mults_d = vld1q_u16(&MULTS_D);
let mults_e = vld1q_u16(&MULTS_E);
let mults_f = vld1q_u16(&MULTS_F);
let mults_g = vld1q_u16(&MULTS_G);
let mults_h = vld1q_u16(&MULTS_H);
let mut s1 = adler & 0xFFFF;
let mut s2 = adler >> 16;
let mut remaining = data;
while !remaining.is_empty() {
// Peel off the next chunk (a multiple of 64 bytes unless it is the last one);
// s1/s2 are reduced modulo DIVISOR once per chunk.
let n = remaining.len().min(MAX_CHUNK_LEN & !63);
let (chunk, rest) = remaining.split_at(n);
remaining = rest;
let mut p = chunk;
if p.len() >= 64 {
// v_s1          : running byte sums (32-bit lanes).
// v_s2          : inside the loop, the sum of the v_s1 values seen at the start of every
//                 iteration; turned into the full s2 contribution after the loop.
// v_byte_sums_* : per-position byte sums, widened to 16 bits.
let mut v_s1 = vdupq_n_u32(0);
let mut v_s2 = vdupq_n_u32(0);
let mut v_byte_sums_a = vdupq_n_u16(0);
let mut v_byte_sums_b = vdupq_n_u16(0);
let mut v_byte_sums_c = vdupq_n_u16(0);
let mut v_byte_sums_d = vdupq_n_u16(0);
let mut v_byte_sums_e = vdupq_n_u16(0);
let mut v_byte_sums_f = vdupq_n_u16(0);
let mut v_byte_sums_g = vdupq_n_u16(0);
let mut v_byte_sums_h = vdupq_n_u16(0);
// Each vectorized byte adds the incoming s1 to s2 exactly once; account for that up front.
let vectorized_len = p.len() & !63;
s2 += s1 * vectorized_len as u32;
while p.len() >= 64 {
let data_a: &[u8; 16] = p[0..16].try_into().unwrap();
let data_b: &[u8; 16] = p[16..32].try_into().unwrap();
let data_c: &[u8; 16] = p[32..48].try_into().unwrap();
let data_d: &[u8; 16] = p[48..64].try_into().unwrap();
let data_a = vld1q_u8(data_a);
let data_b = vld1q_u8(data_b);
let data_c = vld1q_u8(data_c);
let data_d = vld1q_u8(data_d);
// Must happen before v_s1 is updated: records the byte sums of all earlier iterations.
v_s2 = vaddq_u32(v_s2, v_s1);
// `tmp` gathers pairwise byte sums of all four vectors in 16-bit lanes, while the
// widening adds keep a separate 16-bit total for every byte position.
let mut tmp = vpaddlq_u8(data_a);
v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(data_a));
v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(data_a));
tmp = vpadalq_u8(tmp, data_b);
v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(data_b));
v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(data_b));
tmp = vpadalq_u8(tmp, data_c);
v_byte_sums_e = vaddw_u8(v_byte_sums_e, vget_low_u8(data_c));
v_byte_sums_f = vaddw_u8(v_byte_sums_f, vget_high_u8(data_c));
tmp = vpadalq_u8(tmp, data_d);
v_byte_sums_g = vaddw_u8(v_byte_sums_g, vget_low_u8(data_d));
v_byte_sums_h = vaddw_u8(v_byte_sums_h, vget_high_u8(data_d));
v_s1 = vpadalq_u16(v_s1, tmp);
p = &p[64..];
}
// Earlier iterations' byte sums count 64 times each (shift by 6). The shift is the
// saturating variant, but with at most 86 iterations per chunk each lane stays far
// below u32::MAX, so it behaves like a plain shift here.
v_s2 = vqshlq_n_u32::<6>(v_s2);
// Add the per-position sums times their weights (widening 16 x 16 -> 32 multiply-accumulate).
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), vget_low_u16(mults_a));
v_s2 = vmlal_high_u16(v_s2, v_byte_sums_a, mults_a);
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), vget_low_u16(mults_b));
v_s2 = vmlal_high_u16(v_s2, v_byte_sums_b, mults_b);
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), vget_low_u16(mults_c));
v_s2 = vmlal_high_u16(v_s2, v_byte_sums_c, mults_c);
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d), vget_low_u16(mults_d));
v_s2 = vmlal_high_u16(v_s2, v_byte_sums_d, mults_d);
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_e), vget_low_u16(mults_e));
v_s2 = vmlal_high_u16(v_s2, v_byte_sums_e, mults_e);
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_f), vget_low_u16(mults_f));
v_s2 = vmlal_high_u16(v_s2, v_byte_sums_f, mults_f);
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_g), vget_low_u16(mults_g));
v_s2 = vmlal_high_u16(v_s2, v_byte_sums_g, mults_g);
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_h), vget_low_u16(mults_h));
v_s2 = vmlal_high_u16(v_s2, v_byte_sums_h, mults_h);
// Horizontal adds across the four 32-bit lanes.
s1 += vaddvq_u32(v_s1);
s2 += vaddvq_u32(v_s2);
}
// Handles the sub-64-byte tail (possibly empty) and reduces s1/s2 modulo DIVISOR.
adler32_chunk_scalar(&mut s1, &mut s2, p);
}
(s2 << 16) | s1
}
/// Adler-32 update for the WebAssembly SIMD tier: two 128-bit vectors (32 bytes) per iteration.
///
/// Splits `adler` into `s1` (low 16 bits) and `s2` (high 16 bits), walks `data` in chunks of at
/// most `MAX_SIMD_CHUNK` bytes and returns `(s2 << 16) | s1`. Byte sums are built with pairwise
/// extending adds; the position-weighted part of `s2` is recovered after the loop from
/// per-position 16-bit byte sums multiplied by the `MULTS_*` tables. The tail of each chunk
/// and the `% DIVISOR` reduction are handled by `adler32_chunk_scalar`.
///
/// NOTE(review): `#[arcane]` and `Wasm128Token` come from `archmage`; presumably the token
/// proves simd128 is available — confirm against the archmage documentation.
#[cfg(target_arch = "wasm32")]
#[arcane]
fn adler32_impl_wasm128(_token: Wasm128Token, adler: u32, data: &[u8]) -> u32 {
// Bytes per 128-bit vector.
const VL: usize = 16;
// Chunk cap: the smaller of MAX_CHUNK_LEN and the i16 limit, rounded down to a multiple of
// the 2 * VL loop stride. Every 16-bit lane of v_byte_sums_* grows by at most 255 per
// iteration, so 128 iterations (32640) stay within i16::MAX. Here the i16 limit is the
// binding one: 2 * 16 * 128 = 4096 bytes.
const MAX_SIMD_CHUNK: usize = {
let limit = 2 * VL * (i16::MAX as usize / u8::MAX as usize);
let m = if limit < MAX_CHUNK_LEN {
limit
} else {
MAX_CHUNK_LEN
};
m & !(2 * VL - 1)
};
// Weights 32..=1 for the 32 bytes of one iteration, eight positions per table, in byte order.
static MULTS_A: [i16; 8] = [32, 31, 30, 29, 28, 27, 26, 25];
static MULTS_B: [i16; 8] = [24, 23, 22, 21, 20, 19, 18, 17];
static MULTS_C: [i16; 8] = [16, 15, 14, 13, 12, 11, 10, 9];
static MULTS_D: [i16; 8] = [8, 7, 6, 5, 4, 3, 2, 1];
let mults_a = v128_load(&MULTS_A);
let mults_b = v128_load(&MULTS_B);
let mults_c = v128_load(&MULTS_C);
let mults_d = v128_load(&MULTS_D);
let zeroes = i32x4_splat(0);
let mut s1 = adler & 0xFFFF;
let mut s2 = adler >> 16;
let mut remaining = data;
while !remaining.is_empty() {
// Peel off the next chunk; s1/s2 are reduced modulo DIVISOR once per chunk.
let n = remaining.len().min(MAX_SIMD_CHUNK);
let (chunk, rest) = remaining.split_at(n);
remaining = rest;
let mut p = chunk;
if p.len() >= 2 * VL {
// v_s1          : running byte sums (32-bit lanes).
// v_s1_sums     : sum of the v_s1 values seen at the start of every iteration.
// v_byte_sums_* : per-position byte sums, widened to 16 bits.
let mut v_s1 = zeroes;
let mut v_s1_sums = zeroes;
let mut v_byte_sums_a = i16x8_splat(0);
let mut v_byte_sums_b = i16x8_splat(0);
let mut v_byte_sums_c = i16x8_splat(0);
let mut v_byte_sums_d = i16x8_splat(0);
// Each vectorized byte adds the incoming s1 to s2 exactly once; account for that up front.
let vectorized_len = p.len() & !(2 * VL - 1);
s2 += s1 * vectorized_len as u32;
while p.len() >= 2 * VL {
let data_a: &[u8; 16] = p[..16].try_into().unwrap();
let data_b: &[u8; 16] = p[16..32].try_into().unwrap();
let va = v128_load(data_a);
let vb = v128_load(data_b);
// Must happen before v_s1 is updated: records the byte sums of all earlier iterations.
v_s1_sums = i32x4_add(v_s1_sums, v_s1);
v_byte_sums_a = i16x8_add(v_byte_sums_a, i16x8_extend_low_u8x16(va));
v_byte_sums_b = i16x8_add(v_byte_sums_b, i16x8_extend_high_u8x16(va));
v_byte_sums_c = i16x8_add(v_byte_sums_c, i16x8_extend_low_u8x16(vb));
v_byte_sums_d = i16x8_add(v_byte_sums_d, i16x8_extend_high_u8x16(vb));
// Two rounds of pairwise extending adds: 16 bytes -> 8 x 16-bit -> 4 x 32-bit sums.
let sum_a = i32x4_extadd_pairwise_i16x8(i16x8_extadd_pairwise_u8x16(va));
let sum_b = i32x4_extadd_pairwise_i16x8(i16x8_extadd_pairwise_u8x16(vb));
v_s1 = i32x4_add(v_s1, i32x4_add(sum_a, sum_b));
p = &p[2 * VL..];
}
let v_s2 = {
// Earlier iterations' byte sums count 2 * VL = 32 times each (shift by 5); the
// per-position sums are weighted by the MULTS tables via 16-bit dot products.
let weighted_sums = i32x4_shl(v_s1_sums, 5); let ma = i32x4_dot_i16x8(v_byte_sums_a, mults_a);
let mb = i32x4_dot_i16x8(v_byte_sums_b, mults_b);
let mc = i32x4_dot_i16x8(v_byte_sums_c, mults_c);
let md = i32x4_dot_i16x8(v_byte_sums_d, mults_d);
let sum_ab = i32x4_add(ma, mb);
let sum_cd = i32x4_add(mc, md);
i32x4_add(weighted_sums, i32x4_add(sum_ab, sum_cd))
};
// Horizontal sums: the four lanes are added as i32 and then cast to u32.
s1 += (i32x4_extract_lane::<0>(v_s1)
+ i32x4_extract_lane::<1>(v_s1)
+ i32x4_extract_lane::<2>(v_s1)
+ i32x4_extract_lane::<3>(v_s1)) as u32;
s2 += (i32x4_extract_lane::<0>(v_s2)
+ i32x4_extract_lane::<1>(v_s2)
+ i32x4_extract_lane::<2>(v_s2)
+ i32x4_extract_lane::<3>(v_s2)) as u32;
}
// Handles the sub-32-byte tail (possibly empty) and reduces s1/s2 modulo DIVISOR.
adler32_chunk_scalar(&mut s1, &mut s2, p);
}
(s2 << 16) | s1
}
/// Portable Adler-32 update that relies on no SIMD instructions.
///
/// Splits `adler` into its low (`s1`) and high (`s2`) 16-bit halves, feeds `data` to
/// `adler32_chunk_scalar` in pieces of at most `MAX_CHUNK_LEN & !3` bytes and reassembles the
/// halves. For empty `data` no piece is processed, so the halves are returned as passed in.
fn adler32_impl_scalar(_token: ScalarToken, adler: u32, data: &[u8]) -> u32 {
    // Largest multiple of four that does not exceed the overflow-safe chunk length.
    const STRIDE: usize = MAX_CHUNK_LEN & !3;

    let mut low = adler & 0xFFFF;
    let mut high = adler >> 16;
    for piece in data.chunks(STRIDE) {
        adler32_chunk_scalar(&mut low, &mut high, piece);
    }
    (high << 16) | low
}
/// Folds `data` into the running halves `s1` and `s2`, then reduces both modulo `DIVISOR`.
///
/// The input is consumed four bytes at a time. Instead of updating `s2` after every byte, the
/// loop tracks the `s1` value seen at the start of each group and one byte total per position
/// in the group, and applies their weighted sum to `s2` once at the end. Up to three leftover
/// bytes are then processed one by one.
///
/// No reduction happens while accumulating, so the caller is responsible for keeping the
/// amount of data per call small enough that the u32 accumulators cannot overflow.
fn adler32_chunk_scalar(s1: &mut u32, s2: &mut u32, data: &[u8]) {
    let groups = data.chunks_exact(4);
    let leftover = groups.remainder();

    // Sum of the s1 values observed at the start of every four-byte group.
    let mut s1_prefix_total: u32 = 0;
    // Totals of the bytes found at offsets 0, 1, 2 and 3 of every group.
    let mut position_totals = [0u32; 4];
    for group in groups {
        s1_prefix_total += *s1;
        *s1 += group[0] as u32 + group[1] as u32 + group[2] as u32 + group[3] as u32;
        for (total, &byte) in position_totals.iter_mut().zip(group) {
            *total += u32::from(byte);
        }
    }
    // A byte at offset k of its group is counted (4 - k) times within that group, and each
    // group-start s1 is counted four times. All terms are zero when there was no full group.
    let [at0, at1, at2, at3] = position_totals;
    *s2 += 4 * (s1_prefix_total + at0) + 3 * at1 + 2 * at2 + at3;

    for &byte in leftover {
        *s1 += u32::from(byte);
        *s2 += *s1;
    }

    *s1 %= DIVISOR;
    *s2 %= DIVISOR;
}
#[derive(Clone, Debug)]
pub struct Adler32Hasher {
checksum: u32,
amount: u64,
}
impl Adler32Hasher {
pub fn new() -> Self {
Self {
checksum: 1,
amount: 0,
}
}
pub fn from_checksum(checksum: u32) -> Self {
Self {
checksum,
amount: 0,
}
}
pub fn write(&mut self, data: &[u8]) {
self.checksum = adler32(self.checksum, data);
self.amount += data.len() as u64;
}
#[must_use]
pub fn finish(&self) -> u32 {
self.checksum
}
pub fn reset(&mut self) {
self.checksum = 1;
self.amount = 0;
}
pub fn combine(&mut self, other: &Self) {
self.checksum = adler32_combine(self.checksum, other.checksum, other.amount as usize);
self.amount += other.amount;
}
pub fn amount(&self) -> u64 {
self.amount
}
}
impl Default for Adler32Hasher {
fn default() -> Self {
Self::new()
}
}
impl core::hash::Hasher for Adler32Hasher {
fn finish(&self) -> u64 {
self.checksum as u64
}
fn write(&mut self, bytes: &[u8]) {
Adler32Hasher::write(self, bytes);
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Sample input shared by most tests in this module.
    const GREETING: &[u8] = b"Hello World";

    /// Feeding no bytes leaves the initial checksum untouched.
    #[test]
    fn test_initial_value() {
        let unchanged = adler32(1, &[]);
        assert_eq!(unchanged, 1);
    }

    /// Starting from the initial checksum, a single byte `b` yields `1 + b` in both halves.
    #[test]
    fn test_single_byte() {
        for (byte, half) in [(0u8, 1u32), (1, 2), (255, 256)] {
            assert_eq!(adler32(1, &[byte]), (half << 16) | half);
        }
    }

    /// The result agrees with libdeflater on a short string.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_known_values() {
        let expected = libdeflater::adler32(GREETING);
        assert_eq!(adler32(1, GREETING), expected);
    }

    /// Checksumming in two steps gives the same value as one pass over the whole input.
    #[test]
    fn test_incremental() {
        let (head, tail) = GREETING.split_at(5);
        let one_pass = adler32(1, GREETING);
        let two_pass = adler32(adler32(1, head), tail);
        assert_eq!(one_pass, two_pass);
    }

    /// Several `write` calls accumulate both the checksum and the byte count.
    #[test]
    fn hasher_new_write_finish() {
        let mut hasher = Adler32Hasher::new();
        for part in [&b"Hello"[..], &b" World"[..]] {
            hasher.write(part);
        }
        assert_eq!(hasher.finish(), adler32(1, GREETING));
        assert_eq!(hasher.amount(), 11);
    }

    /// `Default` produces the initial state.
    #[test]
    fn hasher_default() {
        let hasher = Adler32Hasher::default();
        assert_eq!(hasher.finish(), 1);
        assert_eq!(hasher.amount(), 0);
    }

    /// A hasher resumed from a partial checksum continues where that checksum left off.
    #[test]
    fn hasher_from_checksum() {
        let mut hasher = Adler32Hasher::from_checksum(adler32(1, b"Hello"));
        hasher.write(b" World");
        assert_eq!(hasher.finish(), adler32(1, GREETING));
    }

    /// `reset` discards everything written so far.
    #[test]
    fn hasher_reset() {
        let mut hasher = Adler32Hasher::new();
        hasher.write(b"data");
        hasher.reset();
        assert_eq!(hasher.finish(), 1);
        assert_eq!(hasher.amount(), 0);
    }

    /// Combining two hashers equals hashing the concatenated input.
    #[test]
    fn hasher_combine() {
        let mut front = Adler32Hasher::new();
        front.write(b"Hello, ");
        let mut back = Adler32Hasher::new();
        back.write(b"World!");

        front.combine(&back);

        assert_eq!(front.finish(), adler32(1, b"Hello, World!"));
        assert_eq!(front.amount(), 13);
    }

    /// The `core::hash::Hasher` implementation reports the same checksum, widened to `u64`.
    #[test]
    fn hasher_core_hash_hasher_trait() {
        use core::hash::Hasher;
        let mut hasher = Adler32Hasher::new();
        Hasher::write(&mut hasher, GREETING);
        let expected = adler32(1, GREETING) as u64;
        assert_eq!(Hasher::finish(&hasher), expected);
    }

    /// A clone carries over both the checksum and the byte count.
    #[test]
    fn hasher_clone() {
        let mut original = Adler32Hasher::new();
        original.write(b"Hello");
        let copy = original.clone();
        assert_eq!(original.finish(), copy.finish());
        assert_eq!(original.amount(), copy.amount());
    }

    /// Writing an empty slice changes nothing.
    #[test]
    fn hasher_empty_write() {
        let mut hasher = Adler32Hasher::new();
        hasher.write(b"");
        assert_eq!(hasher.finish(), 1);
        assert_eq!(hasher.amount(), 0);
    }
}
#[cfg(all(test, not(miri)))]
mod parity {
    use super::*;

    /// Lengths used by the constant-fill tests.
    const FILL_LENGTHS: [usize; 4] = [1, 100, 5552, 65536];

    /// Builds `len` bytes following the repeating pattern 0, 1, ..., 255.
    fn ramp(len: usize) -> alloc::vec::Vec<u8> {
        (0..=255u8).cycle().take(len).collect()
    }

    /// Asserts that a one-shot checksum of `data` matches libdeflater.
    fn check_parity(data: &[u8]) {
        assert_eq!(
            adler32(1, data),
            libdeflater::adler32(data),
            "adler32 mismatch for {} bytes",
            data.len()
        );
    }

    /// Asserts that checksumming `data` in two steps, split at `split` (clamped to the data
    /// length), matches libdeflater's one-shot result.
    fn check_parity_incremental(data: &[u8], split: usize) {
        let split = split.min(data.len());
        let (head, tail) = data.split_at(split);
        let ours = adler32(adler32(1, head), tail);
        let theirs = libdeflater::adler32(data);
        assert_eq!(
            ours,
            theirs,
            "incremental adler32 mismatch for {} bytes split at {}",
            data.len(),
            split
        );
    }

    #[test]
    fn parity_empty() {
        check_parity(&[]);
    }

    #[test]
    fn parity_single_byte() {
        (0..=u8::MAX).for_each(|byte| check_parity(&[byte]));
    }

    #[test]
    fn parity_all_zeros() {
        for len in FILL_LENGTHS {
            check_parity(&alloc::vec![0u8; len]);
        }
    }

    #[test]
    fn parity_all_ff() {
        for len in FILL_LENGTHS {
            check_parity(&alloc::vec![0xFFu8; len]);
        }
    }

    #[test]
    fn parity_sequential() {
        check_parity(&ramp(100_000));
    }

    /// Lengths just around one and two multiples of the 5552-byte chunk length.
    #[test]
    fn parity_chunk_boundary() {
        for len in [5550, 5551, 5552, 5553, 5554, 11104, 11105] {
            check_parity(&ramp(len));
        }
    }

    #[test]
    fn parity_incremental() {
        let data = ramp(20_000);
        for split in [0, 1, 100, 5552, 10000, 20000] {
            check_parity_incremental(&data, split);
        }
    }

    #[test]
    fn parity_large() {
        check_parity(&ramp(1_000_000));
    }

    #[test]
    fn test_adler32_combine_basic() {
        let (front, back) = (b"Hello, ", b"World!");
        let whole = super::adler32(1, b"Hello, World!");
        let merged = super::adler32_combine(
            super::adler32(1, front),
            super::adler32(1, back),
            back.len(),
        );
        assert_eq!(merged, whole);
    }

    #[test]
    fn test_adler32_combine_large() {
        let data = ramp(100_000);
        let adler_full = super::adler32(1, &data);
        for split in [1, 100, 1000, 32768, 50000, 99999] {
            let (front, back) = data.split_at(split);
            let combined = super::adler32_combine(
                super::adler32(1, front),
                super::adler32(1, back),
                back.len(),
            );
            assert_eq!(combined, adler_full, "failed at split={split}");
        }
    }

    /// Combining with the checksum of an empty range is the identity.
    #[test]
    fn test_adler32_combine_empty() {
        let adler = super::adler32(1, b"test data");
        assert_eq!(super::adler32_combine(adler, 1, 0), adler);
    }

    /// One-shot, chunked and combined use of the hasher all agree with libdeflater.
    #[test]
    fn hasher_parity_with_libdeflater() {
        let data = ramp(100_000);
        let expected = libdeflater::adler32(&data);

        let mut one_shot = Adler32Hasher::new();
        one_shot.write(&data);
        assert_eq!(one_shot.finish(), expected);

        let mut chunked = Adler32Hasher::new();
        data.chunks(1337).for_each(|chunk| chunked.write(chunk));
        assert_eq!(chunked.finish(), expected);

        let (front, back) = data.split_at(50_000);
        let mut first = Adler32Hasher::new();
        first.write(front);
        let mut second = Adler32Hasher::new();
        second.write(back);
        first.combine(&second);
        assert_eq!(first.finish(), expected);
    }

    /// Runs a range of sizes through every token permutation offered by
    /// `for_each_token_permutation` and compares each result with libdeflater.
    #[test]
    fn adler32_all_simd_tiers() {
        use archmage::testing::{CompileTimePolicy, for_each_token_permutation};

        let sizes = [0, 1, 15, 16, 31, 32, 63, 64, 128, 256, 5552, 5553, 100_000];
        let reference: alloc::vec::Vec<u32> = sizes
            .iter()
            .map(|&sz| libdeflater::adler32(&ramp(sz)))
            .collect();

        let report = for_each_token_permutation(CompileTimePolicy::Warn, |perm| {
            for (&sz, &expected) in sizes.iter().zip(&reference) {
                let result = super::adler32(1, &ramp(sz));
                assert_eq!(
                    result, expected,
                    "adler32 mismatch at size={sz}, tier: {perm}"
                );
            }
        });
        eprintln!("adler32 permutation test: {report}");
    }
}