sthash 0.2.14

A very fast cryptographic hash function for large data.
Documentation
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use super::*;

impl Hasher {
    #[allow(clippy::cast_ptr_alignment)]
    #[target_feature(enable = "avx2")]
    #[inline]
    pub(crate) unsafe fn hash_avx2(&self, out: &mut Vec<u8>, msg: &[u8]) {
        let mut key_ = &self.key[..];
        let mut k0 = _mm256_loadu_si256(key_.as_ptr().add(0) as *const __m256i);
        let mut k1 = _mm256_loadu_si256(key_.as_ptr().add(4) as *const __m256i);
        let mut k2;
        let mut k3;
        key_ = &key_[8..];
        let (mut sums0, mut sums1, mut sums2, mut sums3) = (
            _mm256_setzero_si256(),
            _mm256_setzero_si256(),
            _mm256_setzero_si256(),
            _mm256_setzero_si256(),
        );
        let mut msg_ = msg;
        let mut remaining = msg_.len();
        while remaining >= 64 {
            let t3 = _mm256_loadu_si256(msg_.as_ptr().add(0) as *const __m256i);
            k2 = _mm256_loadu_si256(key_.as_ptr().add(0) as *const __m256i);
            k3 = _mm256_loadu_si256(key_.as_ptr().add(4) as *const __m256i);
            {
                let t0 = _mm256_add_epi32(k0, t3);
                let t1 = _mm256_add_epi32(k1, t3);
                let t2 = _mm256_add_epi32(k2, t3);
                let t3 = _mm256_add_epi32(k3, t3);
                let t4 = _mm256_shuffle_epi32(t0, 0x10);
                let t0 = _mm256_shuffle_epi32(t0, 0x32);
                let t5 = _mm256_shuffle_epi32(t1, 0x10);
                let t1 = _mm256_shuffle_epi32(t1, 0x32);
                let t6 = _mm256_shuffle_epi32(t2, 0x10);
                let t2 = _mm256_shuffle_epi32(t2, 0x32);
                let t7 = _mm256_shuffle_epi32(t3, 0x10);
                let t3 = _mm256_shuffle_epi32(t3, 0x32);
                let t0 = _mm256_mul_epu32(t0, t4);
                let t1 = _mm256_mul_epu32(t1, t5);
                let t2 = _mm256_mul_epu32(t2, t6);
                let t3 = _mm256_mul_epu32(t3, t7);
                sums0 = _mm256_add_epi64(sums0, t0);
                sums1 = _mm256_add_epi64(sums1, t1);
                sums2 = _mm256_add_epi64(sums2, t2);
                sums3 = _mm256_add_epi64(sums3, t3);
            }
            let t3 = _mm256_loadu_si256(msg_.as_ptr().add(32) as *const __m256i);
            k0 = _mm256_loadu_si256(key_.as_ptr().add(8) as *const __m256i);
            k1 = _mm256_loadu_si256(key_.as_ptr().add(12) as *const __m256i);
            {
                let t0 = _mm256_add_epi32(k2, t3);
                let t1 = _mm256_add_epi32(k3, t3);
                let t2 = _mm256_add_epi32(k0, t3);
                let t3 = _mm256_add_epi32(k1, t3);
                let t4 = _mm256_shuffle_epi32(t0, 0x10);
                let t0 = _mm256_shuffle_epi32(t0, 0x32);
                let t5 = _mm256_shuffle_epi32(t1, 0x10);
                let t1 = _mm256_shuffle_epi32(t1, 0x32);
                let t6 = _mm256_shuffle_epi32(t2, 0x10);
                let t2 = _mm256_shuffle_epi32(t2, 0x32);
                let t7 = _mm256_shuffle_epi32(t3, 0x10);
                let t3 = _mm256_shuffle_epi32(t3, 0x32);
                let t0 = _mm256_mul_epu32(t0, t4);
                let t1 = _mm256_mul_epu32(t1, t5);
                let t2 = _mm256_mul_epu32(t2, t6);
                let t3 = _mm256_mul_epu32(t3, t7);
                sums0 = _mm256_add_epi64(sums0, t0);
                sums1 = _mm256_add_epi64(sums1, t1);
                sums2 = _mm256_add_epi64(sums2, t2);
                sums3 = _mm256_add_epi64(sums3, t3);
            }
            msg_ = &msg_[64..];
            key_ = &key_[16..];
            remaining -= 64;
        }
        assert_eq!(remaining, 0);

        let t0 = _mm256_unpacklo_epi64(sums0, sums1);
        let t1 = _mm256_unpackhi_epi64(sums0, sums1);
        let t2 = _mm256_unpacklo_epi64(sums2, sums3);
        let t3 = _mm256_unpackhi_epi64(sums2, sums3);

        let t4 = _mm256_inserti128_si256(t0, _mm256_castsi256_si128(t2), 0x1);
        let t5 = _mm256_inserti128_si256(t1, _mm256_castsi256_si128(t3), 0x1);
        let t0 = _mm256_permute2x128_si256(t0, t2, 0x31);
        let t1 = _mm256_permute2x128_si256(t1, t3, 0x31);

        let t4 = _mm256_add_epi64(t4, t5);
        let t0 = _mm256_add_epi64(t0, t1);
        let t0 = _mm256_add_epi64(t0, t4);

        let idx = out.len();
        out.reserve(32);
        out.set_len(out.len() + 32);
        let addr = out.as_mut_ptr().add(idx);
        _mm256_storeu_si256(addr as *mut __m256i, t0);
    }
}