zenflate 0.3.0

Pure Rust DEFLATE/zlib/gzip compression and decompression
//! CRC-32 checksum for gzip. Scalar slice-by-8 and SIMD folding
//! implementations via archmage.
//!
//! Uses SIMD acceleration when available via archmage:
//! - VPCLMULQDQ 512-bit folding (X64V4xToken: AVX-512 + VPCLMULQDQ, `avx512` feature)
//! - PCLMULQDQ 128-bit folding (X64CryptoToken: SSE4.2 + PCLMULQDQ + AES)
//! - ARM PMULL folding (NeonAesToken: NEON + AES, scalar tail)
//! - Scalar slice-by-8 fallback

use archmage::prelude::*;

use super::tables::CRC32_SLICE8_TABLE;

/// Compute the CRC-32 checksum of `data`, continuing from `crc`.
///
/// To compute from scratch, pass `crc = 0`. To continue a running
/// checksum, pass the previous return value.
///
/// This matches `libdeflate_crc32` semantics: the internal CRC state
/// is inverted before and after processing.
///
/// ```
/// use zenflate::crc32;
///
/// let checksum = crc32(0, b"Hello");
/// // Continue with more data:
/// let checksum = crc32(checksum, b" World");
/// ```
#[must_use]
#[allow(unexpected_cfgs)]
pub fn crc32(crc: u32, data: &[u8]) -> u32 {
    if data.is_empty() {
        return crc;
    }
    #[cfg(feature = "avx512")]
    {
        !incant!(crc32_impl(!crc, data), [v4x, x64_crypto, neon_aes])
    }
    #[cfg(not(feature = "avx512"))]
    {
        !incant!(crc32_impl(!crc, data), [x64_crypto, neon_aes])
    }
}

/// Combine two CRC-32 checksums.
///
/// Given `crc1 = crc32(0, data1)` and `crc2 = crc32(0, data2)`, returns
/// `crc32(0, data1 || data2)` in O(log(len2)) time without needing the
/// original data. Used for parallel checksum computation.
///
/// CRC-32 is linear over GF(2), so
/// `crc(A||B) = crc(A) * x^(8*len(B)) + crc(B)` in the CRC polynomial ring.
/// The multiplication by `x^n mod G(x)` is computed via matrix repeated squaring.
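///
/// For example (a minimal sketch assuming `crc32_combine` is re-exported at the
/// crate root alongside [`crc32`]):
///
/// ```
/// // Assumes `crc32_combine` is re-exported at the crate root, like `crc32`.
/// use zenflate::{crc32, crc32_combine};
///
/// let crc1 = crc32(0, b"Hello, ");
/// let crc2 = crc32(0, b"World!");
/// // Same result as crc32(0, b"Hello, World!"), without re-reading the first buffer.
/// assert_eq!(crc32_combine(crc1, crc2, b"World!".len()), crc32(0, b"Hello, World!"));
/// ```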
#[must_use]
pub fn crc32_combine(crc1: u32, crc2: u32, len2: usize) -> u32 {
    if len2 == 0 {
        return crc1;
    }

    let mut even = [0u32; 32];
    let mut odd = [0u32; 32];

    // Operator for one zero bit: CRC-32 polynomial shift
    odd[0] = CRC32_POLY;
    let mut row = 1u32;
    for item in &mut odd[1..] {
        *item = row;
        row <<= 1;
    }

    // Square to get operator for two zero bits
    gf2_matrix_square(&mut even, &odd);
    // Square again for four zero bits
    gf2_matrix_square(&mut odd, &even);

    // Apply 8*len2 zero bits using binary decomposition of len2.
    // Starting from the operator for 4 zero bits, each loop iteration
    // doubles: first square gives 8 bits (1 byte), then 16, 32, etc.
    let mut result = crc1;
    let mut n = len2;
    loop {
        gf2_matrix_square(&mut even, &odd);
        if n & 1 != 0 {
            result = gf2_matrix_times(&even, result);
        }
        n >>= 1;
        if n == 0 {
            break;
        }

        gf2_matrix_square(&mut odd, &even);
        if n & 1 != 0 {
            result = gf2_matrix_times(&odd, result);
        }
        n >>= 1;
        if n == 0 {
            break;
        }
    }

    result ^ crc2
}

/// CRC-32 polynomial (bit-reversed).
const CRC32_POLY: u32 = 0xEDB88320;

/// Multiply a 32-bit vector by a 32×32 matrix over GF(2).
///
/// Each `mat[i]` is the image of basis bit `i`, so the product is the XOR of
/// the entries selected by the set bits of `vec`.
fn gf2_matrix_times(mat: &[u32; 32], mut vec: u32) -> u32 {
    let mut sum = 0u32;
    let mut i = 0;
    while vec != 0 {
        if vec & 1 != 0 {
            sum ^= mat[i];
        }
        vec >>= 1;
        i += 1;
    }
    sum
}

/// Square a GF(2) matrix.
fn gf2_matrix_square(square: &mut [u32; 32], mat: &[u32; 32]) {
    for n in 0..32 {
        square[n] = gf2_matrix_times(mat, mat[n]);
    }
}

// ---------------------------------------------------------------------------
// CRC-32 fold constants: x^N mod G(x) where G is the gzip CRC-32 polynomial.
// From libdeflate's crc32_multipliers.h (generated by gen-crc32-consts.py).
// ---------------------------------------------------------------------------

// For VL=64 (512-bit): 8v folds across 8*512 = 4096 bits
const CRC32_X4127_MODG: i64 = 0x1072db28_u32 as i64;
const CRC32_X4063_MODG: i64 = 0x0c30f51d_u32 as i64;

// For VL=64 4v / VL=32 8v: folds across 2048 bits
const CRC32_X2079_MODG: i64 = 0xce3371cb_u32 as i64;
const CRC32_X2015_MODG: i64 = 0xe95c1271_u32 as i64;

// Shared across vector lengths
const CRC32_X1055_MODG: i64 = 0x33fff533;
const CRC32_X991_MODG: i64 = 0x910eeec1;
const CRC32_X543_MODG: i64 = 0x8f352d95;
const CRC32_X479_MODG: i64 = 0x1d9513d7;
const CRC32_X287_MODG: i64 = 0xf1da05aa;
const CRC32_X223_MODG: i64 = 0x81256527;
const CRC32_X159_MODG: i64 = 0xae689191;
const CRC32_X95_MODG: i64 = 0xccaa009e;

const CRC32_BARRETT_1: i64 = 0xb4e5b025f7011641u64 as i64; // floor(x^95 / G(x))
const CRC32_BARRETT_2: i64 = 0x00000001db710641u64 as i64; // G(x)

/// fold128(src, dst, mults) = dst XOR clmul(src_lo, mults_lo) XOR clmul(src_hi, mults_hi)
macro_rules! fold128 {
    ($src:expr, $dst:expr, $mults:expr) => {{
        let src = $src;
        let mults = $mults;
        _mm_xor_si128(
            _mm_xor_si128($dst, _mm_clmulepi64_si128(src, mults, 0x00)),
            _mm_clmulepi64_si128(src, mults, 0x11),
        )
    }};
}

/// Barrett reduction: 128-bit polynomial → 32-bit CRC.
///
/// Matches the kernel's crc-pclmul-template.S with n=32 and LSB_CRC=1.
macro_rules! barrett_reduce {
    ($x0:expr, $mults_128b:expr, $barrett:expr) => {{
        // Fold 128 → 64 bits: multiply low half by x^64 and XOR with high half
        let x0 = _mm_xor_si128(
            _mm_clmulepi64_si128($x0, $mults_128b, 0x10),
            _mm_bsrli_si128::<8>($x0),
        );
        // Barrett reduction: two clmul steps
        let x1 = _mm_clmulepi64_si128(x0, $barrett, 0x00);
        let x1 = _mm_clmulepi64_si128(x1, $barrett, 0x10);
        _mm_extract_epi32::<2>(_mm_xor_si128(x0, x1)) as u32
    }};
}

// ---------------------------------------------------------------------------
// VPCLMULQDQ 512-bit CRC-32 folding (X64V4xToken: VPCLMULQDQ + AVX-512)
//
// Processes four 128-bit lanes in parallel per 512-bit vector. 8 accumulators
// (8*64=512 bytes per main loop iteration). Reduces via 256-bit and 128-bit
// stages to Barrett reduction.
// ---------------------------------------------------------------------------
#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
#[arcane]
#[allow(clippy::incompatible_msrv)] // avx512 feature requires rustc 1.89+
fn crc32_impl_v4x(_token: X64V4xToken, crc: u32, data: &[u8]) -> u32 {
    let len = data.len();

    // Need at least 64 bytes for 512-bit path
    if len < 64 {
        return crc32_slice8(crc, data);
    }

    // 512-bit multiplier vectors (replicated across four 128-bit lanes)
    let mults_8v = _mm512_set_epi64(
        CRC32_X4063_MODG,
        CRC32_X4127_MODG,
        CRC32_X4063_MODG,
        CRC32_X4127_MODG,
        CRC32_X4063_MODG,
        CRC32_X4127_MODG,
        CRC32_X4063_MODG,
        CRC32_X4127_MODG,
    );
    let mults_4v = _mm512_set_epi64(
        CRC32_X2015_MODG,
        CRC32_X2079_MODG,
        CRC32_X2015_MODG,
        CRC32_X2079_MODG,
        CRC32_X2015_MODG,
        CRC32_X2079_MODG,
        CRC32_X2015_MODG,
        CRC32_X2079_MODG,
    );
    let mults_2v = _mm512_set_epi64(
        CRC32_X991_MODG,
        CRC32_X1055_MODG,
        CRC32_X991_MODG,
        CRC32_X1055_MODG,
        CRC32_X991_MODG,
        CRC32_X1055_MODG,
        CRC32_X991_MODG,
        CRC32_X1055_MODG,
    );
    let mults_1v = _mm512_set_epi64(
        CRC32_X479_MODG,
        CRC32_X543_MODG,
        CRC32_X479_MODG,
        CRC32_X543_MODG,
        CRC32_X479_MODG,
        CRC32_X543_MODG,
        CRC32_X479_MODG,
        CRC32_X543_MODG,
    );
    // 256-bit constants for 512→256 reduction (256-bit distance between halves)
    let mults_256b = _mm256_set_epi64x(
        CRC32_X223_MODG,
        CRC32_X287_MODG,
        CRC32_X223_MODG,
        CRC32_X287_MODG,
    );
    // 128-bit constants for 256→128 and Barrett reduction
    let mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
    let barrett = _mm_set_epi64x(CRC32_BARRETT_2, CRC32_BARRETT_1);

    /// fold512 — VPCLMULQDQ + vpternlog across four 128-bit lanes
    macro_rules! fold512 {
        ($src:expr, $dst:expr, $mults:expr) => {{
            let src = $src;
            let mults = $mults;
            _mm512_ternarylogic_epi32(
                _mm512_clmulepi64_epi128(src, mults, 0x00),
                _mm512_clmulepi64_epi128(src, mults, 0x11),
                $dst,
                0x96,
            )
        }};
    }

    /// fold256 — VPCLMULQDQ + vpternlog across two 128-bit lanes
    macro_rules! fold256 {
        ($src:expr, $dst:expr, $mults:expr) => {{
            let src = $src;
            let mults = $mults;
            _mm256_ternarylogic_epi32(
                _mm256_clmulepi64_epi128(src, mults, 0x00),
                _mm256_clmulepi64_epi128(src, mults, 0x11),
                $dst,
                0x96,
            )
        }};
    }

    #[inline(always)]
    fn ld64(data: &[u8], off: usize) -> &[u8; 64] {
        data[off..off + 64].try_into().unwrap()
    }
    #[inline(always)]
    fn ld32(data: &[u8], off: usize) -> &[u8; 32] {
        data[off..off + 32].try_into().unwrap()
    }
    #[inline(always)]
    fn ld16(data: &[u8], off: usize) -> &[u8; 16] {
        data[off..off + 16].try_into().unwrap()
    }

    let mut p = data;

    // Initialize: load first 64 bytes, XOR CRC into low 32 bits
    let crc_v = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, crc as i32);
    let mut x0 = _mm512_xor_si512(_mm512_loadu_si512(ld64(p, 0)), crc_v);
    p = &p[64..];

    if p.len() >= 448 {
        // 8-accumulator path: need 7 more 64-byte vectors (448 bytes)
        let mut v0 = x0;
        let mut v1 = _mm512_loadu_si512(ld64(p, 0));
        let mut v2 = _mm512_loadu_si512(ld64(p, 64));
        let mut v3 = _mm512_loadu_si512(ld64(p, 128));
        let mut v4 = _mm512_loadu_si512(ld64(p, 192));
        let mut v5 = _mm512_loadu_si512(ld64(p, 256));
        let mut v6 = _mm512_loadu_si512(ld64(p, 320));
        let mut v7 = _mm512_loadu_si512(ld64(p, 384));
        p = &p[448..];

        // Main loop: 512 bytes per iteration (8 vectors × 64 bytes)
        while p.len() >= 512 {
            v0 = fold512!(v0, _mm512_loadu_si512(ld64(p, 0)), mults_8v);
            v1 = fold512!(v1, _mm512_loadu_si512(ld64(p, 64)), mults_8v);
            v2 = fold512!(v2, _mm512_loadu_si512(ld64(p, 128)), mults_8v);
            v3 = fold512!(v3, _mm512_loadu_si512(ld64(p, 192)), mults_8v);
            v4 = fold512!(v4, _mm512_loadu_si512(ld64(p, 256)), mults_8v);
            v5 = fold512!(v5, _mm512_loadu_si512(ld64(p, 320)), mults_8v);
            v6 = fold512!(v6, _mm512_loadu_si512(ld64(p, 384)), mults_8v);
            v7 = fold512!(v7, _mm512_loadu_si512(ld64(p, 448)), mults_8v);
            p = &p[512..];
        }

        // Fold 8 → 4
        v0 = fold512!(v0, v4, mults_4v);
        v1 = fold512!(v1, v5, mults_4v);
        v2 = fold512!(v2, v6, mults_4v);
        v3 = fold512!(v3, v7, mults_4v);
        if p.len() >= 256 {
            v0 = fold512!(v0, _mm512_loadu_si512(ld64(p, 0)), mults_4v);
            v1 = fold512!(v1, _mm512_loadu_si512(ld64(p, 64)), mults_4v);
            v2 = fold512!(v2, _mm512_loadu_si512(ld64(p, 128)), mults_4v);
            v3 = fold512!(v3, _mm512_loadu_si512(ld64(p, 192)), mults_4v);
            p = &p[256..];
        }

        // Fold 4 → 2
        v0 = fold512!(v0, v2, mults_2v);
        v1 = fold512!(v1, v3, mults_2v);
        if p.len() >= 128 {
            v0 = fold512!(v0, _mm512_loadu_si512(ld64(p, 0)), mults_2v);
            v1 = fold512!(v1, _mm512_loadu_si512(ld64(p, 64)), mults_2v);
            p = &p[128..];
        }

        // Fold 2 → 1
        x0 = fold512!(v0, v1, mults_1v);
        if p.len() >= 64 {
            x0 = fold512!(x0, _mm512_loadu_si512(ld64(p, 0)), mults_1v);
            p = &p[64..];
        }
    } else {
        // Small path: fold one 64-byte chunk at a time
        while p.len() >= 64 {
            x0 = fold512!(x0, _mm512_loadu_si512(ld64(p, 0)), mults_1v);
            p = &p[64..];
        }
    }

    // Reduce 512→256: fold lo 256 into hi 256 (256-bit distance)
    let x256_lo = _mm512_castsi512_si256(x0);
    let x256_hi = _mm512_extracti64x4_epi64(x0, 1);
    let mut y0 = fold256!(x256_lo, x256_hi, mults_256b);

    // Handle remaining 32-byte chunk
    if p.len() >= 32 {
        y0 = fold256!(y0, _mm256_loadu_si256(ld32(p, 0)), mults_256b);
        p = &p[32..];
    }

    // Reduce 256→128: fold lo 128 into hi 128 (128-bit distance)
    let x128_lo = _mm256_castsi256_si128(y0);
    let x128_hi = _mm256_extracti128_si256(y0, 1);
    let mut x128 = fold128!(x128_lo, x128_hi, mults_128b);

    // Handle remaining 16-byte chunk
    if p.len() >= 16 {
        x128 = fold128!(x128, _mm_loadu_si128(ld16(p, 0)), mults_128b);
        p = &p[16..];
    }

    // Handle remaining 1-15 bytes with scalar
    if !p.is_empty() {
        let partial = barrett_reduce!(x128, mults_128b, barrett);
        return crc32_slice8(partial, p);
    }

    barrett_reduce!(x128, mults_128b, barrett)
}

// ---------------------------------------------------------------------------
// PCLMULQDQ 128-bit CRC-32 folding (X64CryptoToken: SSE4.2 + PCLMULQDQ + AES)
// ---------------------------------------------------------------------------
#[cfg(target_arch = "x86_64")]
#[arcane]
fn crc32_impl_x64_crypto(_token: X64CryptoToken, crc: u32, data: &[u8]) -> u32 {
    let len = data.len();

    // PCLMULQDQ needs at least 16 bytes
    if len < 16 {
        return crc32_slice8(crc, data);
    }

    #[inline(always)]
    fn chunk16(data: &[u8], offset: usize) -> &[u8; 16] {
        data[offset..offset + 16].try_into().unwrap()
    }

    // Multiplier vectors (VL=16)
    let mults_8v = _mm_set_epi64x(CRC32_X991_MODG, CRC32_X1055_MODG);
    let mults_4v = _mm_set_epi64x(CRC32_X479_MODG, CRC32_X543_MODG);
    let mults_2v = _mm_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG);
    let mults_1v = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
    let barrett = _mm_set_epi64x(CRC32_BARRETT_2, CRC32_BARRETT_1);

    // Initialize: load first 16 bytes, XOR CRC into low 32 bits
    let mut x0 = _mm_xor_si128(
        _mm_loadu_si128(chunk16(data, 0)),
        _mm_cvtsi32_si128(crc as i32),
    );
    let mut pos = 16;

    if len >= 128 {
        // 8-accumulator path: load 8 vectors (128 bytes)
        let mut v0 = x0;
        let mut v1 = _mm_loadu_si128(chunk16(data, 16));
        let mut v2 = _mm_loadu_si128(chunk16(data, 32));
        let mut v3 = _mm_loadu_si128(chunk16(data, 48));
        let mut v4 = _mm_loadu_si128(chunk16(data, 64));
        let mut v5 = _mm_loadu_si128(chunk16(data, 80));
        let mut v6 = _mm_loadu_si128(chunk16(data, 96));
        let mut v7 = _mm_loadu_si128(chunk16(data, 112));
        pos = 128;

        // Main loop: 128 bytes per iteration (8 vectors × 16 bytes)
        while pos + 128 <= len {
            v0 = fold128!(v0, _mm_loadu_si128(chunk16(data, pos)), mults_8v);
            v1 = fold128!(v1, _mm_loadu_si128(chunk16(data, pos + 16)), mults_8v);
            v2 = fold128!(v2, _mm_loadu_si128(chunk16(data, pos + 32)), mults_8v);
            v3 = fold128!(v3, _mm_loadu_si128(chunk16(data, pos + 48)), mults_8v);
            v4 = fold128!(v4, _mm_loadu_si128(chunk16(data, pos + 64)), mults_8v);
            v5 = fold128!(v5, _mm_loadu_si128(chunk16(data, pos + 80)), mults_8v);
            v6 = fold128!(v6, _mm_loadu_si128(chunk16(data, pos + 96)), mults_8v);
            v7 = fold128!(v7, _mm_loadu_si128(chunk16(data, pos + 112)), mults_8v);
            pos += 128;
        }

        // Fold 8 → 4 accumulators
        v0 = fold128!(v0, v4, mults_4v);
        v1 = fold128!(v1, v5, mults_4v);
        v2 = fold128!(v2, v6, mults_4v);
        v3 = fold128!(v3, v7, mults_4v);
        if pos + 64 <= len {
            v0 = fold128!(v0, _mm_loadu_si128(chunk16(data, pos)), mults_4v);
            v1 = fold128!(v1, _mm_loadu_si128(chunk16(data, pos + 16)), mults_4v);
            v2 = fold128!(v2, _mm_loadu_si128(chunk16(data, pos + 32)), mults_4v);
            v3 = fold128!(v3, _mm_loadu_si128(chunk16(data, pos + 48)), mults_4v);
            pos += 64;
        }

        // Fold 4 → 2 accumulators
        v0 = fold128!(v0, v2, mults_2v);
        v1 = fold128!(v1, v3, mults_2v);
        if pos + 32 <= len {
            v0 = fold128!(v0, _mm_loadu_si128(chunk16(data, pos)), mults_2v);
            v1 = fold128!(v1, _mm_loadu_si128(chunk16(data, pos + 16)), mults_2v);
            pos += 32;
        }

        // Fold 2 → 1 accumulator
        x0 = fold128!(v0, v1, mults_1v);
        if pos + 16 <= len {
            x0 = fold128!(x0, _mm_loadu_si128(chunk16(data, pos)), mults_1v);
            pos += 16;
        }
    } else {
        // Small path: fold 16-byte chunks one at a time
        while pos + 16 <= len {
            x0 = fold128!(x0, _mm_loadu_si128(chunk16(data, pos)), mults_1v);
            pos += 16;
        }
    }

    // Handle remaining 1-15 bytes with scalar
    let tail = len - pos;
    if tail > 0 {
        let partial = barrett_reduce!(x0, mults_1v, barrett);
        return crc32_slice8(partial, &data[pos..]);
    }

    // Barrett reduction: 128-bit → 32-bit CRC
    barrett_reduce!(x0, mults_1v, barrett)
}

// ---------------------------------------------------------------------------
// NEON PMULL CRC-32 folding (aarch64: NeonAesToken = NEON + PMULL)
//
// Uses vmull_p64 for carryless multiplication — same folding algorithm as
// x86 PCLMULQDQ but with ARM PMULL instructions. 4-accumulator path for
// large data, Barrett reduction to 32 bits.
// Ported from libdeflate's arm/crc32_impl.h (crc32_arm_pmullx4).
// ---------------------------------------------------------------------------

/// clmul_low(src, mults) = lo64(src) * lo64(mults) -> 128-bit as u8x16
#[cfg(target_arch = "aarch64")]
macro_rules! neon_clmul_low {
    ($a:expr, $b:expr) => {{
        let a = $a;
        let b = $b;
        vreinterpretq_u8_p128(vmull_p64(
            vgetq_lane_p64(vreinterpretq_p64_u8(a), 0),
            vgetq_lane_p64(b, 0),
        ))
    }};
}

/// clmul_high(src, mults) = hi64(src) * hi64(mults) -> 128-bit as u8x16
#[cfg(target_arch = "aarch64")]
macro_rules! neon_clmul_high {
    ($a:expr, $b:expr) => {{
        let a = $a;
        let b = $b;
        vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b))
    }};
}

/// fold_vec(src, dst, mults) = dst XOR clmul_low(src, mults) XOR clmul_high(src, mults)
#[cfg(target_arch = "aarch64")]
macro_rules! neon_fold_vec {
    ($src:expr, $dst:expr, $mults:expr) => {{
        let src = $src;
        let mults = $mults;
        let a = neon_clmul_low!(src, mults);
        let b = neon_clmul_high!(src, mults);
        veorq_u8(veorq_u8(a, b), $dst)
    }};
}

/// Barrett reduction: 128-bit polynomial -> 32-bit CRC
#[cfg(target_arch = "aarch64")]
macro_rules! neon_barrett_reduce {
    ($v0:expr, $c0:expr, $c1:expr, $c2:expr) => {{
        let zero = vdupq_n_u8(0);
        // Fold 128 -> 64 bits: multiply low half by x^64 constant, XOR with high half
        let x0 = veorq_u8(neon_clmul_low!($v0, $c0), vextq_u8($v0, zero, 8));
        // Barrett step 1
        let x1 = neon_clmul_low!(x0, $c1);
        // Barrett step 2
        let x1 = neon_clmul_low!(x1, $c2);
        // Extract CRC from lane 2 (bytes 8-11)
        vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(x0, x1)), 2)
    }};
}

#[cfg(target_arch = "aarch64")]
#[arcane]
fn crc32_impl_neon_aes(_token: NeonAesToken, crc: u32, data: &[u8]) -> u32 {
    let len = data.len();

    // Need at least 16 bytes for PMULL path
    if len < 16 {
        return crc32_slice8(crc, data);
    }

    // Fold constants: x^N mod G(x), loaded as poly64x2_t pairs.
    // Same constants as the x86 PCLMULQDQ path.
    static MULTS_1V: [u64; 2] = [CRC32_X159_MODG as u64, CRC32_X95_MODG as u64];
    static MULTS_4V: [u64; 2] = [CRC32_X543_MODG as u64, CRC32_X479_MODG as u64];
    static MULTS_2V: [u64; 2] = [CRC32_X287_MODG as u64, CRC32_X223_MODG as u64];
    static BARRETT_0: [u64; 2] = [CRC32_X95_MODG as u64, 0];
    static BARRETT_1: [u64; 2] = [CRC32_BARRETT_1 as u64, 0];
    static BARRETT_2: [u64; 2] = [CRC32_BARRETT_2 as u64, 0];

    let multipliers_1 = vreinterpretq_p64_u64(vld1q_u64(&MULTS_1V));
    let multipliers_4 = vreinterpretq_p64_u64(vld1q_u64(&MULTS_4V));
    let multipliers_2 = vreinterpretq_p64_u64(vld1q_u64(&MULTS_2V));
    let barrett_c0 = vreinterpretq_p64_u64(vld1q_u64(&BARRETT_0));
    let barrett_c1 = vreinterpretq_p64_u64(vld1q_u64(&BARRETT_1));
    let barrett_c2 = vreinterpretq_p64_u64(vld1q_u64(&BARRETT_2));

    #[inline(always)]
    fn ld16(data: &[u8], off: usize) -> &[u8; 16] {
        data[off..off + 16].try_into().unwrap()
    }

    let mut p = data;

    // Initialize: load first 16 bytes, XOR CRC into low 32 bits
    let crc_vec = vreinterpretq_u8_u32(vsetq_lane_u32(crc, vdupq_n_u32(0), 0));
    let mut v0 = veorq_u8(vld1q_u8(ld16(p, 0)), crc_vec);
    p = &p[16..];

    if p.len() >= 48 {
        // 4-accumulator path: load 3 more vectors
        let mut v1 = vld1q_u8(ld16(p, 0));
        let mut v2 = vld1q_u8(ld16(p, 16));
        let mut v3 = vld1q_u8(ld16(p, 32));
        p = &p[48..];

        // Main loop: fold 64 bytes per iteration (4 vectors x 16 bytes)
        while p.len() >= 64 {
            v0 = neon_fold_vec!(v0, vld1q_u8(ld16(p, 0)), multipliers_4);
            v1 = neon_fold_vec!(v1, vld1q_u8(ld16(p, 16)), multipliers_4);
            v2 = neon_fold_vec!(v2, vld1q_u8(ld16(p, 32)), multipliers_4);
            v3 = neon_fold_vec!(v3, vld1q_u8(ld16(p, 48)), multipliers_4);
            p = &p[64..];
        }

        // Fold 4 -> 2 accumulators
        v0 = neon_fold_vec!(v0, v2, multipliers_2);
        v1 = neon_fold_vec!(v1, v3, multipliers_2);
        if p.len() >= 32 {
            v0 = neon_fold_vec!(v0, vld1q_u8(ld16(p, 0)), multipliers_2);
            v1 = neon_fold_vec!(v1, vld1q_u8(ld16(p, 16)), multipliers_2);
            p = &p[32..];
        }

        // Fold 2 -> 1 accumulator
        v0 = neon_fold_vec!(v0, v1, multipliers_1);
        if p.len() >= 16 {
            v0 = neon_fold_vec!(v0, vld1q_u8(ld16(p, 0)), multipliers_1);
            p = &p[16..];
        }
    } else {
        // Small path: fold 16-byte chunks one at a time
        while p.len() >= 16 {
            v0 = neon_fold_vec!(v0, vld1q_u8(ld16(p, 0)), multipliers_1);
            p = &p[16..];
        }
    }

    // Handle remaining 1-15 bytes with scalar
    if !p.is_empty() {
        let partial = neon_barrett_reduce!(v0, barrett_c0, barrett_c1, barrett_c2);
        return crc32_slice8(partial, p);
    }

    neon_barrett_reduce!(v0, barrett_c0, barrett_c1, barrett_c2)
}

// ---------------------------------------------------------------------------
// Scalar fallback: slice-by-8
// ---------------------------------------------------------------------------

fn crc32_impl_scalar(_token: ScalarToken, crc: u32, data: &[u8]) -> u32 {
    crc32_slice8(crc, data)
}

/// Core slice-by-8 CRC-32 implementation.
///
/// Processes the first `len % 8` bytes one at a time, then the rest in
/// 8-byte chunks using eight parallel table lookups.
fn crc32_slice8(mut crc: u32, data: &[u8]) -> u32 {
    let table = &CRC32_SLICE8_TABLE;

    // Process leading bytes to make the remainder a multiple of 8
    let lead = data.len() % 8;
    for &b in &data[..lead] {
        crc = (crc >> 8) ^ table[((crc as u8) ^ b) as usize];
    }

    // Main loop: 8 bytes at a time via chunks_exact
    for chunk in data[lead..].chunks_exact(8) {
        let v1 = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
        let v2 = u32::from_le_bytes([chunk[4], chunk[5], chunk[6], chunk[7]]);

        #[allow(clippy::identity_op)]
        {
            crc = table[0x700 + ((crc ^ v1) as u8) as usize]
                ^ table[0x600 + (((crc ^ v1) >> 8) as u8) as usize]
                ^ table[0x500 + (((crc ^ v1) >> 16) as u8) as usize]
                ^ table[0x400 + (((crc ^ v1) >> 24) as u8) as usize]
                ^ table[0x300 + (v2 as u8) as usize]
                ^ table[0x200 + ((v2 >> 8) as u8) as usize]
                ^ table[0x100 + ((v2 >> 16) as u8) as usize]
                ^ table[0x000 + ((v2 >> 24) as u8) as usize];
        }
    }

    crc
}

/// Builder-style CRC-32 hasher — drop-in replacement for `crc32fast::Hasher`.
///
/// Wraps the SIMD-accelerated [`crc32`] function in a struct that tracks
/// running state and byte count (for [`combine`](Crc32Hasher::combine)).
///
/// ```
/// use zenflate::Crc32Hasher;
///
/// let mut h = Crc32Hasher::new();
/// h.update(b"Hello");
/// h.update(b" World");
/// assert_eq!(h.finalize(), zenflate::crc32(0, b"Hello World"));
/// ```
#[derive(Clone, Debug)]
pub struct Crc32Hasher {
    crc: u32,
    amount: u64,
}

impl Crc32Hasher {
    /// Create a new hasher with initial CRC of 0.
    pub fn new() -> Self {
        Self { crc: 0, amount: 0 }
    }

    /// Create a hasher seeded with a pre-existing CRC value.
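    ///
    /// Useful for continuing a checksum started elsewhere:
    ///
    /// ```
    /// use zenflate::{crc32, Crc32Hasher};
    ///
    /// let partial = crc32(0, b"Hello");
    /// let mut h = Crc32Hasher::new_with_initial(partial);
    /// h.update(b" World");
    /// assert_eq!(h.finalize(), crc32(0, b"Hello World"));
    /// ```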
    pub fn new_with_initial(init: u32) -> Self {
        Self {
            crc: init,
            amount: 0,
        }
    }

    /// One-shot CRC-32. Matches `crc32fast::hash()`.
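    ///
    /// For example, the standard CRC-32 check value:
    ///
    /// ```
    /// assert_eq!(zenflate::Crc32Hasher::hash(b"123456789"), 0xCBF43926);
    /// ```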
    #[must_use]
    pub fn hash(data: &[u8]) -> u32 {
        crc32(0, data)
    }

    /// Feed data into the running CRC.
    pub fn update(&mut self, buf: &[u8]) {
        self.crc = crc32(self.crc, buf);
        self.amount += buf.len() as u64;
    }

    /// Return the current CRC (non-consuming).
    #[must_use]
    pub fn finalize(&self) -> u32 {
        self.crc
    }

    /// Reset to the initial state.
    pub fn reset(&mut self) {
        self.crc = 0;
        self.amount = 0;
    }

    /// Combine another hasher's state into this one.
    ///
    /// Equivalent to appending `other`'s input data after `self`'s.
    /// Both hashers must have been started from initial CRC of 0.
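    ///
    /// For example, combining two independently hashed halves:
    ///
    /// ```
    /// use zenflate::Crc32Hasher;
    ///
    /// let mut h1 = Crc32Hasher::new();
    /// h1.update(b"Hello, ");
    /// let mut h2 = Crc32Hasher::new();
    /// h2.update(b"World!");
    /// h1.combine(&h2);
    /// assert_eq!(h1.finalize(), zenflate::crc32(0, b"Hello, World!"));
    /// ```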
    pub fn combine(&mut self, other: &Self) {
        self.crc = crc32_combine(self.crc, other.crc, other.amount as usize);
        self.amount += other.amount;
    }

    /// Total bytes fed so far.
    pub fn amount(&self) -> u64 {
        self.amount
    }
}

impl Default for Crc32Hasher {
    fn default() -> Self {
        Self::new()
    }
}

impl core::hash::Hasher for Crc32Hasher {
    fn finish(&self) -> u64 {
        self.crc as u64
    }

    fn write(&mut self, bytes: &[u8]) {
        self.update(bytes);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_initial_value() {
        assert_eq!(crc32(0, &[]), 0);
    }

    #[test]
    fn test_known_value() {
        // CRC-32 of "123456789" is 0xCBF43926
        assert_eq!(crc32(0, b"123456789"), 0xCBF43926);
    }

    #[test]
    fn test_incremental() {
        let data = b"Hello World";
        let full = crc32(0, data);
        let partial = crc32(0, &data[..5]);
        let incremental = crc32(partial, &data[5..]);
        assert_eq!(full, incremental);
    }

    #[test]
    fn hasher_new_update_finalize() {
        let mut h = Crc32Hasher::new();
        h.update(b"Hello");
        h.update(b" World");
        assert_eq!(h.finalize(), crc32(0, b"Hello World"));
        assert_eq!(h.amount(), 11);
    }

    #[test]
    fn hasher_default() {
        let h = Crc32Hasher::default();
        assert_eq!(h.finalize(), 0);
        assert_eq!(h.amount(), 0);
    }

    #[test]
    fn hasher_new_with_initial() {
        let partial = crc32(0, b"Hello");
        let mut h = Crc32Hasher::new_with_initial(partial);
        h.update(b" World");
        assert_eq!(h.finalize(), crc32(0, b"Hello World"));
    }

    #[test]
    fn hasher_hash_one_shot() {
        assert_eq!(Crc32Hasher::hash(b"123456789"), 0xCBF43926);
        assert_eq!(Crc32Hasher::hash(b"123456789"), crc32(0, b"123456789"));
    }

    #[test]
    fn hasher_reset() {
        let mut h = Crc32Hasher::new();
        h.update(b"data");
        h.reset();
        assert_eq!(h.finalize(), 0);
        assert_eq!(h.amount(), 0);
    }

    #[test]
    fn hasher_combine() {
        let mut h1 = Crc32Hasher::new();
        h1.update(b"Hello, ");
        let mut h2 = Crc32Hasher::new();
        h2.update(b"World!");
        h1.combine(&h2);
        assert_eq!(h1.finalize(), crc32(0, b"Hello, World!"));
        assert_eq!(h1.amount(), 13);
    }

    #[test]
    fn hasher_core_hash_hasher_trait() {
        use core::hash::Hasher;
        let mut h = Crc32Hasher::new();
        Hasher::write(&mut h, b"Hello World");
        assert_eq!(Hasher::finish(&h), crc32(0, b"Hello World") as u64);
    }

    #[test]
    fn hasher_clone() {
        let mut h = Crc32Hasher::new();
        h.update(b"Hello");
        let h2 = h.clone();
        assert_eq!(h.finalize(), h2.finalize());
        assert_eq!(h.amount(), h2.amount());
    }

    #[test]
    fn hasher_empty_update() {
        let mut h = Crc32Hasher::new();
        h.update(b"");
        assert_eq!(h.finalize(), 0);
        assert_eq!(h.amount(), 0);
    }
}

// All parity tests use libdeflater (C FFI) for comparison.
#[cfg(all(test, not(miri)))]
mod parity {
    use super::*;

    fn check_parity(data: &[u8]) {
        let ours = crc32(0, data);
        let theirs = libdeflater::crc32(data);
        assert_eq!(ours, theirs, "crc32 mismatch for {} bytes", data.len());
    }

    fn check_parity_incremental(data: &[u8], split: usize) {
        let split = split.min(data.len());
        let ours = {
            let c = crc32(0, &data[..split]);
            crc32(c, &data[split..])
        };
        let theirs = libdeflater::crc32(data);
        assert_eq!(
            ours,
            theirs,
            "incremental crc32 mismatch for {} bytes split at {}",
            data.len(),
            split
        );
    }

    #[test]
    fn parity_empty() {
        check_parity(&[]);
    }

    #[test]
    fn parity_single_byte() {
        for b in 0..=255u8 {
            check_parity(&[b]);
        }
    }

    #[test]
    fn parity_all_zeros() {
        for &len in &[1, 100, 8, 16, 64, 65536] {
            check_parity(&vec![0u8; len]);
        }
    }

    #[test]
    fn parity_all_ff() {
        for &len in &[1, 100, 8, 16, 64, 65536] {
            check_parity(&vec![0xFFu8; len]);
        }
    }

    #[test]
    fn parity_sequential() {
        let data: Vec<u8> = (0..=255).cycle().take(100_000).collect();
        check_parity(&data);
    }

    #[test]
    fn parity_alignment_variants() {
        // Test SIMD boundary alignment: different leading byte counts before 16-byte chunks
        for offset in 0..32 {
            let data: Vec<u8> = (0..=255).cycle().take(1000 + offset).collect();
            check_parity(&data);
        }
    }

    #[test]
    fn parity_incremental() {
        let data: Vec<u8> = (0..=255).cycle().take(20_000).collect();
        for &split in &[0, 1, 7, 8, 9, 15, 16, 17, 100, 127, 128, 129, 10000, 20000] {
            check_parity_incremental(&data, split);
        }
    }

    #[test]
    fn parity_large() {
        let data: Vec<u8> = (0..=255).cycle().take(1_000_000).collect();
        check_parity(&data);
    }

    #[test]
    fn parity_simd_boundaries() {
        // Test lengths around SIMD thresholds: 16, 32, 64, 128, 256
        for len in (0..300).chain([512, 1024, 4096, 65536].iter().copied()) {
            let data: Vec<u8> = (0..=255).cycle().take(len).collect();
            check_parity(&data);
        }
    }

    /// Verify that PCLMULQDQ (_mm_clmulepi64_si128) compiles as safe inside #[arcane] with X64CryptoToken.
    #[test]
    #[cfg(target_arch = "x86_64")]
    fn pclmulqdq_compiles_in_arcane() {
        use archmage::prelude::*;

        #[arcane]
        fn clmul_test(_token: X64CryptoToken, a: u64, b: u64) -> u64 {
            let va = _mm_set_epi64x(0, a as i64);
            let vb = _mm_set_epi64x(0, b as i64);
            let result = _mm_clmulepi64_si128(va, vb, 0x00);
            _mm_extract_epi64(result, 0) as u64
        }

        let Some(token) = X64CryptoToken::summon() else {
            eprintln!("skipping pclmulqdq test: X64CryptoToken not available on this CPU");
            return;
        };
        let result = clmul_test(token, 7, 3);
        // 7 clmul 3 in GF(2): (x²+x+1)(x+1) = x³+1 = 9
        assert_eq!(result, 9);
    }

    #[test]
    fn test_crc32_combine_basic() {
        let data1 = b"Hello, ";
        let data2 = b"World!";
        let full = b"Hello, World!";

        let crc_full = super::crc32(0, full);
        let crc1 = super::crc32(0, data1);
        let crc2 = super::crc32(0, data2);
        let combined = super::crc32_combine(crc1, crc2, data2.len());
        assert_eq!(combined, crc_full);
    }

    #[test]
    fn test_crc32_combine_large() {
        let data: Vec<u8> = (0..=255).cycle().take(100_000).collect();
        for split in [1, 100, 1000, 32768, 50000, 99999] {
            let (a, b) = data.split_at(split);
            let crc_full = super::crc32(0, &data);
            let crc1 = super::crc32(0, a);
            let crc2 = super::crc32(0, b);
            let combined = super::crc32_combine(crc1, crc2, b.len());
            assert_eq!(combined, crc_full, "failed at split={split}");
        }
    }

    #[test]
    fn test_crc32_combine_empty() {
        let data = b"test data";
        let crc = super::crc32(0, data);
        assert_eq!(super::crc32_combine(crc, 0, 0), crc);
        assert_eq!(super::crc32_combine(0, crc, data.len()), crc);
    }

    #[test]
    #[cfg(all(target_arch = "x86_64", feature = "avx512"))]
    fn avx512_modern_crc32_if_available() {
        use archmage::prelude::*;
        // Only test if the CPU actually supports AVX-512 VPCLMULQDQ
        if let Some(_token) = X64V4xToken::summon() {
            let data: Vec<u8> = (0..=255).cycle().take(8192).collect();
            let expected = super::crc32(0, &data);
            assert_eq!(expected, libdeflater::crc32(&data));
        }
    }

    #[test]
    fn hasher_parity_with_libdeflater() {
        let data: Vec<u8> = (0..=255).cycle().take(100_000).collect();
        let expected = libdeflater::crc32(&data);

        // Single update
        let mut h = Crc32Hasher::new();
        h.update(&data);
        assert_eq!(h.finalize(), expected);

        // Incremental updates
        let mut h = Crc32Hasher::new();
        for chunk in data.chunks(1337) {
            h.update(chunk);
        }
        assert_eq!(h.finalize(), expected);

        // Combine
        let (a, b) = data.split_at(50_000);
        let mut h1 = Crc32Hasher::new();
        h1.update(a);
        let mut h2 = Crc32Hasher::new();
        h2.update(b);
        h1.combine(&h2);
        assert_eq!(h1.finalize(), expected);

        // One-shot hash
        assert_eq!(Crc32Hasher::hash(&data), expected);
    }

    /// Verify all SIMD dispatch tiers produce identical results to scalar.
    ///
    /// Uses archmage's `for_each_token_permutation` to disable tokens in every
    /// valid combination, then checks that the dispatched result matches
    /// the reference (libdeflater C). Run with `--test-threads=1` for full
    /// correctness (token disabling is process-wide).
    #[test]
    fn crc32_all_simd_tiers() {
        use archmage::testing::{CompileTimePolicy, for_each_token_permutation};

        // Test data at various sizes to exercise scalar tail, PCLMULQDQ small path,
        // PCLMULQDQ 8-accumulator path, and VPCLMULQDQ 512-bit path.
        let sizes = [
            0, 1, 7, 8, 15, 16, 17, 32, 64, 127, 128, 129, 256, 512, 100_000,
        ];
        let reference: Vec<u32> = sizes
            .iter()
            .map(|&sz| {
                let data: Vec<u8> = (0..=255u8).cycle().take(sz).collect();
                libdeflater::crc32(&data)
            })
            .collect();

        let report = for_each_token_permutation(CompileTimePolicy::Warn, |perm| {
            for (i, &sz) in sizes.iter().enumerate() {
                let data: Vec<u8> = (0..=255u8).cycle().take(sz).collect();
                let result = super::crc32(0, &data);
                assert_eq!(
                    result, reference[i],
                    "crc32 mismatch at size={sz}, tier: {perm}"
                );
            }
        });
        eprintln!("crc32 permutation test: {report}");
    }
}