cubehash 0.4.1

CubeHash (rev2/rev3) hashing with SIMD acceleration and portable fallback
Documentation
#[cfg(any(feature = "force-scalar", all(target_arch = "wasm32", not(target_feature = "simd128"))
    , not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64", target_arch = "wasm32"))))]
mod scalar_backend {
    use crate::{Backend, BLOCKSIZE, ROUNDS, CubeHashParams, rounds_for_rev};

    /// Four `u32` lanes stored together in one 16-byte-aligned value.
    ///
    /// The free functions `add`, `xor` and `shlxor` in this module operate on
    /// it lane by lane, so it plays the role a 128-bit vector register would
    /// play in a SIMD implementation.
    #[derive(Clone, Copy)]
    #[repr(align(16))]
    struct U32x4([u32; 4]);

    /// Lane-wise sum of `v` and `w`.
    ///
    /// Each lane is added independently and wraps modulo 2^32 on overflow.
    #[inline(always)]
    fn add(v: U32x4, w: U32x4) -> U32x4 {
        let mut lanes = v.0;
        for (lane, rhs) in lanes.iter_mut().zip(w.0.iter()) {
            *lane = lane.wrapping_add(*rhs);
        }
        U32x4(lanes)
    }

    /// Lane-wise bitwise exclusive-or of `v` and `w`.
    #[inline(always)]
    fn xor(v: U32x4, w: U32x4) -> U32x4 {
        let mut lanes = v.0;
        for (lane, rhs) in lanes.iter_mut().zip(w.0.iter()) {
            *lane ^= *rhs;
        }
        U32x4(lanes)
    }

    /// Rotates every lane of `v` left by `N` bits.
    ///
    /// The name reflects the shift-pair spelling of a rotate,
    /// `(x << N) ^ (x >> (32 - N))`. `u32::rotate_left` yields the same value
    /// for `0 < N < 32` and, unlike the shift pair, stays a true rotation at
    /// the boundaries: `N == 0` returns `v` unchanged (the shift pair produced
    /// `v ^ v == 0`), and `N > 32` no longer underflows `32 - N`.
    #[inline(always)]
    fn shlxor<const N: u32>(v: U32x4) -> U32x4 {
        let [a, b, c, d] = v.0;
        U32x4([
            a.rotate_left(N),
            b.rotate_left(N),
            c.rotate_left(N),
            d.rotate_left(N),
        ])
    }

    impl U32x4 {
        #[inline(always)]
        fn new(a: u32, b: u32, c: u32, d: u32) -> Self {
            U32x4([a, b, c, d])
        }

        #[inline(always)]
        fn permute_badc(self) -> U32x4 {
            U32x4([self.0[1], self.0[0], self.0[3], self.0[2]])
        }

        #[inline(always)]
        fn permute_cdab(self) -> U32x4 {
            U32x4([self.0[2], self.0[3], self.0[0], self.0[1]])
        }

        #[inline(always)]
        fn load_bytes(data: &[u8]) -> U32x4 {
            U32x4([
                u32::from_le_bytes([data[12], data[13], data[14], data[15]]),
                u32::from_le_bytes([data[8],  data[9],  data[10], data[11]]),
                u32::from_le_bytes([data[4],  data[5],  data[6],  data[7]]),
                u32::from_le_bytes([data[0],  data[1],  data[2],  data[3]])
            ])
        }

        #[inline(always)]
        pub fn transmute(self) -> Vec<u8> {
            [
                self.0[3].to_le_bytes(),
                self.0[2].to_le_bytes(),
                self.0[1].to_le_bytes(),
                self.0[0].to_le_bytes()
            ].concat()
        }
    }

    /// Hash state for the scalar backend: eight `U32x4` rows, 32 words in all.
    ///
    /// Within this module, message blocks are XORed into `x0` and `x1`, the
    /// output is serialized from `x0..=x3`, and the finalize flag is XORed
    /// into `x7`.
    pub struct Scalar {
        x0: U32x4,
        x1: U32x4,
        x2: U32x4,
        x3: U32x4,
        x4: U32x4,
        x5: U32x4,
        x6: U32x4,
        x7: U32x4,
    }

    impl Scalar {
        /// Applies the round function `ROUNDS` times to the state, in place.
        ///
        /// Rows `x0..=x3` are handled as the "lower" half and `x4..=x7` as the
        /// "upper" half. One round consists of two half-rounds with the same
        /// shape:
        ///
        /// 1. each upper row is permuted and the matching lower row is added
        ///    to it (`permute_badc` first, `permute_cdab` second);
        /// 2. the lower rows are rotated (by 7, then by 11) and reordered
        ///    (rows `2,3,0,1` first, rows `1,0,3,2` second);
        /// 3. each new lower row is XORed with the matching upper row.
        ///
        /// NOTE(review): this looks like the CubeHash round with the final
        /// upper-half swap deferred to the start of the next round — confirm
        /// against the specification.
        #[inline(always)]
        fn rounds(&mut self) {
            let mut lower = [self.x0, self.x1, self.x2, self.x3];
            let mut upper = [self.x4, self.x5, self.x6, self.x7];

            for _ in 0..ROUNDS {
                // First half-round: pair swap, add, rotate by 7.
                for (hi, lo) in upper.iter_mut().zip(lower.iter()) {
                    *hi = add(*lo, hi.permute_badc());
                }
                // The array literal is fully evaluated from the old `lower`
                // before it replaces it, so every rotation sees the old rows.
                lower = [
                    xor(shlxor::<7>(lower[2]), upper[0]),
                    xor(shlxor::<7>(lower[3]), upper[1]),
                    xor(shlxor::<7>(lower[0]), upper[2]),
                    xor(shlxor::<7>(lower[1]), upper[3]),
                ];

                // Second half-round: half swap, add, rotate by 11.
                for (hi, lo) in upper.iter_mut().zip(lower.iter()) {
                    *hi = add(*lo, hi.permute_cdab());
                }
                lower = [
                    xor(shlxor::<11>(lower[1]), upper[0]),
                    xor(shlxor::<11>(lower[0]), upper[1]),
                    xor(shlxor::<11>(lower[3]), upper[2]),
                    xor(shlxor::<11>(lower[2]), upper[3]),
                ];
            }

            let [x0, x1, x2, x3] = lower;
            let [x4, x5, x6, x7] = upper;
            self.x0 = x0;
            self.x1 = x1;
            self.x2 = x2;
            self.x3 = x3;
            self.x4 = x4;
            self.x5 = x5;
            self.x6 = x6;
            self.x7 = x7;
        }
    }

    impl Backend for Scalar {
        /// Creates the initial state for `params`.
        ///
        /// Row `x0` is seeded with the lanes
        /// `(0, ROUNDS, BLOCKSIZE, params.hash_len_bits / 8)` and every other
        /// row starts at zero. `rounds` is then run `irounds / ROUNDS` times,
        /// where `irounds` is the first value returned by
        /// `rounds_for_rev(params.revision)`; the second value is not used
        /// here.
        fn new(params: CubeHashParams) -> Self {
            let (irounds, _unused) = rounds_for_rev(params.revision);

            let zero = U32x4::new(0, 0, 0, 0);
            let seed = U32x4::new(
                0,
                ROUNDS as u32,
                BLOCKSIZE as u32,
                (params.hash_len_bits / 8) as u32,
            );
            let mut state = Scalar {
                x0: seed,
                x1: zero,
                x2: zero,
                x3: zero,
                x4: zero,
                x5: zero,
                x6: zero,
                x7: zero,
            };

            let passes = irounds / ROUNDS;
            for _ in 0..passes {
                state.rounds();
            }
            state
        }

        /// XORs one message block into rows `x0`/`x1`, then runs `rounds`.
        ///
        /// Bytes `0..16` go into `x0` and bytes `16..32` into `x1`. The length
        /// is checked against `BLOCKSIZE` in debug builds only; in any build,
        /// slicing panics if `block32` is shorter than 32 bytes.
        fn absorb_block(&mut self, block32: &[u8]) {
            debug_assert_eq!(block32.len(), BLOCKSIZE);
            // Both halves are decoded before the state is touched, so a short
            // block panics without leaving a half-updated state behind.
            let first_half = U32x4::load_bytes(&block32[..16]);
            let second_half = U32x4::load_bytes(&block32[16..32]);
            self.x0 = xor(self.x0, first_half);
            self.x1 = xor(self.x1, second_half);
            self.rounds();
        }

        /// XORs the constant row `(0, 1, 0, 0)` into `x7`.
        fn set_finalize_flag(&mut self) {
            let flag = U32x4::new(0, 1, 0, 0);
            self.x7 = xor(self.x7, flag);
        }

        /// Runs `rounds` once without absorbing any input.
        fn rounds_only(&mut self) {
            self.rounds();
        }

        /// Serializes rows `x0..=x3` into 64 bytes, 16 bytes per row in row
        /// order, using `U32x4::transmute` for the byte layout.
        fn output_full(&self) -> [u8; 64] {
            let mut digest = [0u8; 64];
            let rows = [self.x0, self.x1, self.x2, self.x3];
            for (chunk, row) in digest.chunks_exact_mut(16).zip(rows.iter()) {
                chunk.copy_from_slice(&row.transmute());
            }
            digest
        }
    }
}

// Re-export at crate level
#[cfg(any(feature = "force-scalar", all(target_arch = "wasm32", not(target_feature = "simd128"))
    , not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64", target_arch = "wasm32"))))]
pub use scalar_backend::Scalar;