rapidhash 3.1.0

A rust port of rapidhash: an extremely fast, high quality, platform-independent hashing algorithm.
Documentation
use core::hash::{BuildHasher, Hasher};
use super::DEFAULT_RAPID_SECRETS;
use super::mix_np::rapid_mix_np;
use super::rapid_const::rapidhash_core;
use super::seed::rapidhash_seed;

/// This function needs to be as small as possible to have as high a chance of being inlined as
/// possible.
///
/// We try to generate the least amount of LLVM-IR code to reduce the inlining cost. Rust should
/// remove the const statements before generating the LLVM-IR.
macro_rules! write_num {
    ($write_num:ident, $int:ident, $unsigned:ident) => {

        /// Write an integer to the hasher, marked as `#[inline(always)]`.
        #[inline(always)]
        fn $write_num(&mut self, i: $int) {
            const N: u8 = core::mem::size_of::<$int>() as u8 * 8;
            if SPONGE {
                // This early u128 conversion seems to be important on x86, as if it impacts the
                // LLVM inlining cost too much to have it inside the if statement...
                // The compiler also converts an i32 -> i128 -> u128 unless we coerce it into its
                // unsigned type first.
                let bytes = (i as $unsigned) as u128;
                if self.sponge_len + N <= 128 {
                    // HOT: add the bytes into the sponge
                    self.sponge |= bytes << self.sponge_len;
                    self.sponge_len += N;
                } else {
                    // COLD: sponge is full, so we need to flush it
                    let a = self.sponge as u64;
                    let b = (self.sponge >> 64) as u64;
                    self.seed = rapid_mix_np::<PROTECTED>(a ^ self.seed, b ^ self.secrets[0]);
                    self.sponge = bytes;
                    self.sponge_len = N;
                }
            } else {
                // slower but high-quality rapidhash
                let bytes = (i as $unsigned) as u64;
                self.seed = rapid_mix_np::<PROTECTED>(bytes ^ self.secrets[0], bytes ^ self.seed);
            }
        }
    };
}

/// A [Hasher] trait compatible hasher that uses the [rapidhash](https://github.com/Nicoshev/rapidhash)
/// algorithm, and uses `#[inline(always)]` for all methods.
///
/// Using `#[inline(always)]` can deliver a large performance improvement when hashing complex
/// objects, but should be benchmarked for your specific use case. If you have HashMaps for many
/// different types this may come at the cost of some binary size increase.
///
/// See [crate::RapidHasher] for default non-forced inline methods.
///
/// See [RapidHashBuilder] for usage with [std::collections::HashMap].
///
/// # Example
/// ```
/// use std::hash::Hasher;
///
/// use rapidhash::quality::RapidHasher;
///
/// let mut hasher = RapidHasher::default();
/// hasher.write(b"hello world");
/// let hash = hasher.finish();
/// ```
#[derive(Copy, Clone)]
pub struct RapidHasher<const AVALANCHE: bool, const SPONGE: bool, const COMPACT: bool = false, const PROTECTED: bool = false> {
    seed: u64,
    secrets: &'static [u64; 7],  // FUTURE: non-static secrets?
    sponge: u128,
    sponge_len: u8,
}

/// A [std::hash::BuildHasher] trait compatible hasher that uses the [RapidHasher] algorithm.
///
/// This is an alias for [`std::hash::BuildHasherDefault<RapidHasher>`] with a static seed.
///
/// Note there that [crate::RapidRandomState] with can be used instead for a
/// [std::hash::BuildHasher] that initialises with a random seed.
///
/// # Example
/// ```
/// use std::collections::HashMap;
/// use std::hash::Hasher;
///
/// use rapidhash::quality::RapidBuildHasher;
///
/// let mut map = HashMap::with_hasher(RapidBuildHasher::default());
/// map.insert(42, "the answer");
/// ```
#[derive(Copy, Clone, Eq, PartialEq)]
pub struct RapidBuildHasher<const AVALANCHE: bool, const SPONGE: bool, const COMPACT: bool = false, const PROTECTED: bool = false> {
    seed: u64,
    secrets: &'static [u64; 7],
}

impl<const AVALANCHE: bool, const SPONGE: bool, const COMPACT: bool, const PROTECTED: bool> RapidBuildHasher<AVALANCHE, SPONGE, COMPACT, PROTECTED> {
    /// New rapid inline build hasher, and pre-compute the seed.
    #[inline(always)]
    pub const fn new(mut seed: u64) -> Self {
        seed = rapidhash_seed(seed);
        Self { seed, secrets: &DEFAULT_RAPID_SECRETS.secrets }
    }
}

impl<const AVALANCHE: bool, const SPONGE: bool, const COMPACT: bool, const PROTECTED: bool> BuildHasher for RapidBuildHasher<AVALANCHE, SPONGE, COMPACT, PROTECTED> {
    type Hasher = RapidHasher<AVALANCHE, SPONGE, COMPACT, PROTECTED>;

    #[inline(always)]
    fn build_hasher(&self) -> Self::Hasher {
        Self::Hasher::new_precomputed_seed(self.seed, self.secrets)
    }
}

impl<const AVALANCHE: bool, const SPONGE: bool, const COMPACT: bool, const PROTECTED: bool> Default for RapidBuildHasher<AVALANCHE, SPONGE, COMPACT, PROTECTED> {
    #[inline]
    fn default() -> Self {
        Self::new(RapidHasher::<AVALANCHE, SPONGE, COMPACT, PROTECTED>::DEFAULT_SEED)
    }
}

impl<const AVALANCHE: bool, const SPONGE: bool, const COMPACT: bool, const PROTECTED: bool> RapidHasher<AVALANCHE, SPONGE, COMPACT, PROTECTED> {
    /// Default `RapidHasher` seed.
    pub const DEFAULT_SEED: u64 = super::seed::DEFAULT_SEED;

    /// Create a new [RapidHasher] with a custom seed.
    ///
    /// See instead [src::quality::RandomState::new] or [src::fast::RandomState::new] for a random
    /// seed and random secret initialisation, for minimal DoS resistance.
    #[inline(always)]
    #[must_use]
    pub const fn new(mut seed: u64) -> Self {
        // do most of the rapidhash_seed initialisation here to avoid doing it on each int
        seed = rapidhash_seed(seed);
        Self::new_precomputed_seed(seed, &DEFAULT_RAPID_SECRETS.secrets)
    }

    #[inline(always)]
    #[must_use]
    pub(super) const fn new_precomputed_seed(seed: u64, secrets: &'static [u64; 7]) -> Self {
        Self {
            seed,
            secrets,
            sponge: 0,
            sponge_len: 0,
        }
    }

    /// Create a new [RapidHasher] using the default seed and secrets.
    #[inline(always)]
    #[must_use]
    pub const fn default_const() -> Self {
        Self::new(Self::DEFAULT_SEED)
    }
}

impl<const AVALANCHE: bool, const SPONGE: bool, const COMPACT: bool, const PROTECTED: bool> Default for RapidHasher<AVALANCHE, SPONGE, COMPACT, PROTECTED> {
    /// Create a new [RapidHasher] with the default seed.
    ///
    /// See [crate::RapidRandomState] for a [std::hash::BuildHasher] that initialises with a random
    /// seed.
    #[inline(always)]
    fn default() -> Self {
        Self::new(super::seed::DEFAULT_SEED)
    }
}

/// This implementation implements methods for all integer types as the compiler will (hopefully...)
/// inline and heavily optimize the rapidhash_core for each. Where the bytes length is known the
/// compiler can make significant optimisations and saves us writing them out by hand.
impl<const AVALANCHE: bool, const SPONGE: bool, const COMPACT: bool, const PROTECTED: bool> Hasher for RapidHasher<AVALANCHE, SPONGE, COMPACT, PROTECTED> {
    /// Produce the final hash value, marked as `#[inline(always)]`.
    #[inline(always)]
    fn finish(&self) -> u64 {
        // written to minimise the LLVM-IR lines, as rust should remove the const if statements
        if SPONGE {
            if !AVALANCHE {
                if self.sponge_len > 0 {
                    let a = self.sponge as u64;
                    let b = (self.sponge >> 64) as u64;
                    rapid_mix_np::<PROTECTED>(a ^ self.seed, b ^ self.secrets[0])
                } else {
                    self.seed
                }
            } else {
                let mut seed = self.seed;
                if self.sponge_len > 0 {
                    let a = self.sponge as u64;
                    let b = (self.sponge >> 64) as u64;
                    seed = rapid_mix_np::<PROTECTED>(a ^ self.seed, b ^ self.secrets[0]);
                }
                // FUTURE: revisit when write_str is stable, as we'd want to move this into the
                // above if statement
                rapid_mix_np::<PROTECTED>(seed, DEFAULT_RAPID_SECRETS.secrets[1])
            }
        } else {
            if !AVALANCHE {
                self.seed
            } else {
                rapid_mix_np::<PROTECTED>(self.seed, DEFAULT_RAPID_SECRETS.secrets[1])
            }
        }
    }

    /// Write a byte slice to the hasher, marked as `#[inline(always)]`.
    #[inline(always)]
    fn write(&mut self, bytes: &[u8]) {
        self.seed = rapidhash_core::<AVALANCHE, COMPACT, PROTECTED>(self.seed, self.secrets, bytes);
    }

    write_num!(write_u8, u8, u8);
    write_num!(write_u16, u16, u16);
    write_num!(write_u32, u32, u32);
    write_num!(write_u64, u64, u64);
    write_num!(write_usize, usize, usize);
    write_num!(write_i8, i8, u8);
    write_num!(write_i16, i16, u16);
    write_num!(write_i32, i32, u32);
    write_num!(write_i64, i64, u64);
    write_num!(write_isize, isize, usize);

    /// Write an int to the hasher, marked as `#[inline(always)]`.
    #[inline(always)]
    fn write_u128(&mut self, i: u128) {
        let a = i as u64;
        let b = (i >> 64) as u64;
        self.seed = rapid_mix_np::<PROTECTED>(a ^ self.seed, b ^ self.secrets[0]);
    }

    /// Write an int to the hasher, marked as `#[inline(always)]`.
    #[inline(always)]
    fn write_i128(&mut self, i: i128) {
        let a = (i as u128) as u64;
        let b = (i as u128 >> 64) as u64;
        self.seed = rapid_mix_np::<PROTECTED>(a ^ self.seed, b ^ self.secrets[0]);
    }

    // TODO: write_str override once stable; nightly feature; or rustversion cfg
    // #[cfg(feature = "nightly")]
    // #[inline(always)]
    // fn write_str(&mut self, s: &str) {
    //     self.write(s.as_bytes());
    // }
}

#[cfg(test)]
mod tests {
    extern crate std;
    use super::*;

    #[test]
    fn test_hasher_size() {
        assert_eq!(core::mem::size_of::<RapidHasher::<true, true, false, false>>(), 48);
    }

    /// Test that writing a single u64 outputs the same as writing the equivalent bytes.
    ///
    /// Does not apply if the algorithm is using a sponge.
    #[ignore]
    #[test]
    fn test_hasher_write_u64() {
        assert_eq!((8 & 24) >> (8 >> 3), 4);

        let ints = [1234u64, 0, 1, u64::MAX, u64::MAX - 2385962040453523];

        for int in ints {
            let mut hasher = RapidHasher::<true, false>::default();
            hasher.write(int.to_le_bytes().as_slice());
            let a = hasher.finish();

            assert_eq!(int.to_le_bytes().as_slice().len(), 8);

            let mut hasher = RapidHasher::<true, false>::default();
            hasher.write_u64(int);
            let b = hasher.finish();

            assert_eq!(a, b, "Mismatching hash for u64 with input {int}");
        }
    }

    /// Check the number of collisions when writing numbers.
    #[test]
    #[ignore]
    #[cfg(feature = "std")]
    fn test_num_collisions() {
        let builder = RapidBuildHasher::<true, false>::default();
        let mut collisions = 0;
        let mut set = std::collections::HashSet::new();
        for i in 0..=u16::MAX {
            let hash_u16 = builder.hash_one(i) & 0xFFFFFF;
            if set.contains(&hash_u16) {
                collisions += 1;
            } else {
                set.insert(hash_u16);
            }

            // if i < 256 {
            //     let hash_u8 = builder.hash_one(i as u8) & 0xFFFF;
            //     if set.contains(&hash_u8) {
            //         collisions += 1;
            //     } else {
            //         set.insert(hash_u8);
            //     }
            // }
        }
        assert_eq!(collisions, 0, "Collisions found when hashing numbers");
    }
}