axhash-core 1.0.0

use crate::constants::{SECRET, SECRET_STREAM, SECRET_STREAM_LEN, STRIPE_SECRET};
use crate::math::folded_multiply;
use crate::memory::r_u64;

pub(crate) const STRIPE_BYTES: usize = 64;
pub(crate) const STRIPE_LANES: usize = 8;
pub(crate) const BLOCK_STRIPES: usize = 16;
pub(crate) const ACC_INIT_SECRET_OFFSET: usize = 0;
pub(crate) const STRIPE_SECRET_BASE: usize = 8;
// Scramble takes one stripe-sized window of secret; reserve it just below
// the final-stripe secret. Final stripe is the very last 8 u64 of the stream.
pub(crate) const SCRAMBLE_SECRET_OFFSET: usize = SECRET_STREAM_LEN - 2 * STRIPE_LANES;
pub(crate) const FINAL_STRIPE_SECRET_OFFSET: usize = SECRET_STREAM_LEN - STRIPE_LANES;

// PRIME32_1 from XXH3; chosen because (a) it is coprime to 2^64, so multiply
// is a bijection over u64, and (b) being 32-bit means SIMD backends can
// implement the 64x32 wrapping multiply with two 32x32->64 ops + shift + add.
pub(crate) const SCRAMBLE_PRIME: u64 = 0x9E37_79B1;

#[inline(always)]
pub(super) unsafe fn read_partial_u64(ptr: *const u8, len: usize) -> u64 {
    debug_assert!((1..=7).contains(&len));
    if len >= 4 {
        let a = u32::from_le(unsafe { core::ptr::read_unaligned(ptr.cast::<u32>()) });
        let b = u32::from_le(unsafe { core::ptr::read_unaligned(ptr.add(len - 4).cast::<u32>()) });
        (a as u64) | ((b as u64) << 32)
    } else {
        let c1 = unsafe { *ptr } as u64;
        let c2 = unsafe { *ptr.add(len >> 1) } as u64;
        let c3 = unsafe { *ptr.add(len - 1) } as u64;
        let raw = c1 | (c2 << 8) | (c3 << 16);
        let mask = (1u64 << (len << 3)) - 1;
        raw & mask
    }
}

#[inline(always)]
pub(super) unsafe fn hash_bytes_short(ptr: *const u8, len: usize, acc: u64) -> u64 {
    debug_assert!((1..=16).contains(&len));
    let len_u64 = len as u64;
    let len_mix = len_u64.wrapping_mul(0x9E3779B97F4A7C15);

    if len >= 8 {
        let lo = unsafe { r_u64(ptr) };
        let hi = unsafe { r_u64(ptr.add(len - 8)) };

        let m0 = folded_multiply(lo ^ SECRET[0], hi ^ STRIPE_SECRET[0] ^ len_mix);
        folded_multiply(m0 ^ acc, len_mix ^ SECRET[1])
    } else if len == 4 {
        let v = u32::from_le(unsafe { core::ptr::read_unaligned(ptr.cast::<u32>()) }) as u64;
        let lo = v | (v << 32);
        let hi = lo.rotate_left(17) ^ 4 ^ SECRET[0];
        let m0 = folded_multiply(lo ^ SECRET[1] ^ acc, hi ^ STRIPE_SECRET[1]);
        folded_multiply(m0, len_mix ^ STRIPE_SECRET[0])
    } else {
        let lo = unsafe { read_partial_u64(ptr, len) };
        let hi = lo.rotate_left(17) ^ len_u64 ^ SECRET[0];

        let m0 = folded_multiply(lo ^ SECRET[1] ^ acc, hi ^ STRIPE_SECRET[1]);
        folded_multiply(m0, len_mix ^ STRIPE_SECRET[0])
    }
}

#[inline(always)]
pub(super) unsafe fn hash_bytes_17_32(ptr: *const u8, len: usize, acc: u64) -> u64 {
    let a = unsafe { r_u64(ptr) };
    let b = unsafe { r_u64(ptr.add(8)) };
    let c = unsafe { r_u64(ptr.add(len - 16)) };
    let d = unsafe { r_u64(ptr.add(len - 8)) };

    let len_mix = (len as u64).wrapping_mul(0x9E3779B97F4A7C15);

    let front = folded_multiply(a ^ SECRET[0], b ^ STRIPE_SECRET[0]);
    let back = folded_multiply(c ^ SECRET[1], d ^ STRIPE_SECRET[1] ^ len_mix);
    let mix = front ^ back.rotate_left(17);

    folded_multiply(mix ^ acc, len_mix ^ SECRET[2])
}

#[inline(always)]
pub(super) unsafe fn hash_bytes_33_64(ptr: *const u8, len: usize, acc: u64) -> u64 {
    let w0 = unsafe { r_u64(ptr) };
    let w1 = unsafe { r_u64(ptr.add(8)) };
    let w2 = unsafe { r_u64(ptr.add(16)) };
    let w3 = unsafe { r_u64(ptr.add(24)) };
    let w4 = unsafe { r_u64(ptr.add(len - 32)) };
    let w5 = unsafe { r_u64(ptr.add(len - 24)) };
    let w6 = unsafe { r_u64(ptr.add(len - 16)) };
    let w7 = unsafe { r_u64(ptr.add(len - 8)) };

    let len_mix = (len as u64).wrapping_mul(0x9E3779B97F4A7C15);

    let f0 = folded_multiply(w0 ^ SECRET[0], w1 ^ STRIPE_SECRET[0]);
    let f1 = folded_multiply(w2 ^ SECRET[1], w3 ^ STRIPE_SECRET[1]);
    let front = f0 ^ f1.rotate_left(17);

    let b0 = folded_multiply(w4 ^ SECRET[2], w5 ^ STRIPE_SECRET[2] ^ len_mix);
    let b1 = folded_multiply(w6 ^ SECRET[3], w7 ^ STRIPE_SECRET[3]);
    let back = b0 ^ b1.rotate_left(21);

    let m = folded_multiply(front ^ acc, back ^ STRIPE_SECRET[0]);
    folded_multiply(m ^ len_mix.rotate_left(23), SECRET[1] ^ acc)
}

#[inline(always)]
pub(super) unsafe fn hash_bytes_65_128(ptr: *const u8, len: usize, acc: u64) -> u64 {
    let w0 = unsafe { r_u64(ptr) };
    let w1 = unsafe { r_u64(ptr.add(8)) };
    let w2 = unsafe { r_u64(ptr.add(16)) };
    let w3 = unsafe { r_u64(ptr.add(24)) };
    let w4 = unsafe { r_u64(ptr.add(32)) };
    let w5 = unsafe { r_u64(ptr.add(40)) };
    let w6 = unsafe { r_u64(ptr.add(48)) };
    let w7 = unsafe { r_u64(ptr.add(56)) };
    let w8 = unsafe { r_u64(ptr.add(len - 64)) };
    let w9 = unsafe { r_u64(ptr.add(len - 56)) };
    let wa = unsafe { r_u64(ptr.add(len - 48)) };
    let wb = unsafe { r_u64(ptr.add(len - 40)) };
    let wc = unsafe { r_u64(ptr.add(len - 32)) };
    let wd = unsafe { r_u64(ptr.add(len - 24)) };
    let we = unsafe { r_u64(ptr.add(len - 16)) };
    let wf = unsafe { r_u64(ptr.add(len - 8)) };
    let m0 = folded_multiply(w0 ^ SECRET[0], w1 ^ STRIPE_SECRET[0] ^ acc);
    let m1 = folded_multiply(w2 ^ SECRET[1], w3 ^ STRIPE_SECRET[1]);
    let m2 = folded_multiply(w4 ^ SECRET[2], w5 ^ STRIPE_SECRET[2]);
    let m3 = folded_multiply(w6 ^ SECRET[3], w7 ^ STRIPE_SECRET[3]);
    let len_mix = (len as u64).wrapping_mul(0x9E3779B97F4A7C15);
    let m4 = folded_multiply(w8 ^ SECRET[0], w9 ^ STRIPE_SECRET[0] ^ len_mix);
    let m5 = folded_multiply(wa ^ SECRET[1], wb ^ STRIPE_SECRET[1]);
    let m6 = folded_multiply(wc ^ SECRET[2], wd ^ STRIPE_SECRET[2]);
    let m7 = folded_multiply(we ^ SECRET[3], wf ^ STRIPE_SECRET[3]);
    let s0 = m0 ^ m4.rotate_left(17);
    let s1 = m1 ^ m5.rotate_left(19);
    let s2 = m2 ^ m6.rotate_left(23);
    let s3 = m3 ^ m7.rotate_left(29);
    let x = folded_multiply(s0 ^ STRIPE_SECRET[0], s1 ^ STRIPE_SECRET[1]);
    let y = folded_multiply(s2 ^ STRIPE_SECRET[2], s3 ^ STRIPE_SECRET[3]);
    folded_multiply(x ^ y.rotate_left(17), acc ^ len_mix)
}

// Canonical long-input mixing kernel.
//
// The same arithmetic is implemented bit-identically by every SIMD backend
// (aarch64, x86_64), so all CPUs produce the same hash for the same input.
// Operations used (XOR, ADD, 32x32->64 multiply, lane rotates) are exact on
// every architecture; we deliberately avoid AES round instructions because
// x86 `aesenc` and ARM `aese`+`aesmc` are not bit-equivalent per round.
#[inline(always)]
pub(crate) unsafe fn mix_stripe(acc: &mut [u64; STRIPE_LANES], data: *const u8, secret: *const u64) {
    let mut i = 0usize;
    while i < STRIPE_LANES {
        let d = unsafe { r_u64(data.add(i * 8)) };
        let s = unsafe { core::ptr::read_unaligned(secret.add(i)) };
        let k = d ^ s;
        let lo32 = k & 0xFFFF_FFFF;
        let hi32 = k >> 32;
        acc[i ^ 1] = acc[i ^ 1].wrapping_add(d);
        acc[i] = acc[i].wrapping_add(lo32.wrapping_mul(hi32));
        i += 1;
    }
}

#[inline(always)]
pub(crate) fn init_acc(acc_in: u64) -> [u64; STRIPE_LANES] {
    [
        acc_in ^ SECRET_STREAM[ACC_INIT_SECRET_OFFSET],
        acc_in.rotate_left(13).wrapping_add(SECRET_STREAM[ACC_INIT_SECRET_OFFSET + 1]),
        acc_in.rotate_left(29) ^ SECRET_STREAM[ACC_INIT_SECRET_OFFSET + 2],
        acc_in.rotate_left(43).wrapping_add(SECRET_STREAM[ACC_INIT_SECRET_OFFSET + 3]),
        acc_in.rotate_left(7) ^ SECRET_STREAM[ACC_INIT_SECRET_OFFSET + 4],
        acc_in.rotate_left(19).wrapping_add(SECRET_STREAM[ACC_INIT_SECRET_OFFSET + 5]),
        acc_in.rotate_left(37) ^ SECRET_STREAM[ACC_INIT_SECRET_OFFSET + 6],
        acc_in.rotate_left(53).wrapping_add(SECRET_STREAM[ACC_INIT_SECRET_OFFSET + 7]),
    ]
}

// Final merge of the 8 accumulators into a u64. Each pair (acc[2i], acc[2i+1])
// gets its own 128-bit fold, then all four folds are summed together with the
// input length and seeded accumulator. A SplitMix64-style avalanche tail
// spreads any remaining structural bias evenly across all 64 output bits.
// Verified against SMHasher3.
#[inline(always)]
pub(crate) fn merge_acc(acc: &[u64; STRIPE_LANES], len: usize, acc_in: u64) -> u64 {
    let f0 = folded_multiply(acc[0] ^ SECRET_STREAM[0], acc[1] ^ SECRET_STREAM[1]);
    let f1 = folded_multiply(acc[2] ^ SECRET_STREAM[2], acc[3] ^ SECRET_STREAM[3]);
    let f2 = folded_multiply(acc[4] ^ SECRET_STREAM[4], acc[5] ^ SECRET_STREAM[5]);
    let f3 = folded_multiply(acc[6] ^ SECRET_STREAM[6], acc[7] ^ SECRET_STREAM[7]);

    let len_mix = (len as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
    let mut h = f0
        .wrapping_add(f1)
        .wrapping_add(f2)
        .wrapping_add(f3)
        .wrapping_add(len_mix)
        .wrapping_add(acc_in);

    // SplitMix64 finalizer
    h ^= h >> 33;
    h = h.wrapping_mul(0xFF51_AFD7_ED55_8CCD);
    h ^= h >> 33;
    h = h.wrapping_mul(0xC4CE_B9FE_1A85_EC53);
    h ^= h >> 33;
    h
}

// Scramble step. Without this, the stripe accumulation is commutative — two
// inputs that differ only in WHICH stripe a given bit appears at, when those
// stripes happen to share a secret offset, produce identical hashes. The
// scramble injects non-linearity (`v ^= v >> 47`) and stripe-position
// dependence (running between blocks instead of after the full input),
// breaking that symmetry. Verified against SMHasher3's Sparse/OneByte/Long
// keysets.
#[inline(always)]
pub(crate) unsafe fn scramble_acc(acc: &mut [u64; STRIPE_LANES], secret: *const u64) {
    let mut i = 0usize;
    while i < STRIPE_LANES {
        let s = unsafe { core::ptr::read_unaligned(secret.add(i)) };
        let mut v = acc[i];
        v ^= v >> 47;
        v ^= s;
        v = v.wrapping_mul(SCRAMBLE_PRIME);
        acc[i] = v;
        i += 1;
    }
}

#[inline(always)]
pub(crate) unsafe fn hash_bytes_long(ptr: *const u8, len: usize, acc_in: u64) -> u64 {
    let mut acc = init_acc(acc_in);
    let secret_base = SECRET_STREAM.as_ptr();
    let scramble_secret = unsafe { secret_base.add(SCRAMBLE_SECRET_OFFSET) };

    let mut offset = 0usize;
    let mut stripe_in_block = 0usize;
    let mut secret_off = STRIPE_SECRET_BASE;

    while offset + STRIPE_BYTES <= len {
        unsafe {
            mix_stripe(&mut acc, ptr.add(offset), secret_base.add(secret_off));
        }
        offset += STRIPE_BYTES;
        secret_off += 1;
        stripe_in_block += 1;

        if stripe_in_block == BLOCK_STRIPES {
            unsafe { scramble_acc(&mut acc, scramble_secret) };
            stripe_in_block = 0;
            secret_off = STRIPE_SECRET_BASE;
        }
    }

    // Trailing stripe (may overlap the previous one). Always uses a fixed
    // secret offset so the tail influences every input length.
    unsafe {
        mix_stripe(
            &mut acc,
            ptr.add(len - STRIPE_BYTES),
            secret_base.add(FINAL_STRIPE_SECRET_OFFSET),
        );
    }

    merge_acc(&acc, len, acc_in)
}