//! A fast, deterministic, non-cryptographic hash for use in hash tables.
//!
//! ZwoHash implements a very fast hash algorithm optimized for use in hash tables. It has low
//! per-hash overhead, which is important when hashing small keys. It is non-cryptographic and
//! deterministic, and as such it is not suited for inserting untrusted user-provided input into
//! hash tables unless other denial-of-service countermeasures are taken. It thus covers the same
//! use cases as [rustc's FxHash][rustc_hash].
//!
//! Compared to FxHash, ZwoHash provides essentially the same hashing speed while aiming for more
//! uniform outputs. When used in a hash table, ZwoHash almost always matches the performance of
//! FxHash while outperforming it by quite a bit on some common inputs for which FxHash's output
//! is particularly poor.
//!
//! The hash algorithm used by ZwoHash is very similar to that of FxHash: both process one `usize`
//! at a time and perform the same number and kind of operations per `usize`. ZwoHash, though,
//! replaces the last iteration with a slightly more expensive operation that provides better
//! output guarantees. The additional overhead (independent of the size of the hashed data)
//! consists of performing a wide multiplication instead of a truncated multiplication and one
//! additional subtraction. This is very little overhead and barely registers when using ZwoHash
//! in a hash table.
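//!
//! As a sketch (64-bit variant shown; `M` and `R` are this crate's multiplier and rotation
//! constants, defined below):
//!
//! ```text
//! // per input word: one multiplication, one rotation, one xor
//! state = (state * M).rotate_right(R) ^ word;
//! // final mix: widening multiplication, then subtract the high half from the low half
//! wide = (state as u128) * (M as u128);
//! hash = (wide as u64).wrapping_sub((wide >> 64) as u64);
//! ```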
//!
//! ZwoHash guarantees that any input bit can affect any bit of the output. FxHash does not
//! guarantee this, and even beyond that, ZwoHash's output is more uniform. When used in a hash
//! table, this often reduces the number of collisions and thus the number of required probes for
//! each access. This can result in ZwoHash outperforming FxHash in that setting.
//!
//! Sometimes, given inputs for which FxHash is especially ill-suited, ZwoHash outperforms FxHash
//! by a large margin. This includes integer keys that are all multiples of a power of two,
//! floating-point values with a short base-2 representation, pointers returned from the
//! allocator, and other inputs that differ only in the higher bits of the last processed
//! `usize`.
//!
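//! # Example
//!
//! A minimal usage sketch via the [`core::hash::Hasher`] trait (the crate name `zwohash` is
//! assumed here):
//!
//! ```
//! use core::hash::Hasher;
//!
//! let mut hasher = zwohash::ZwoHasher::default();
//! hasher.write_usize(42);
//! let _hash = hasher.finish();
//! ```
//!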
//! [rustc_hash]: https://crates.io/crates/rustc-hash
#![no_std]

#[cfg(feature = "std")]
extern crate std;

#[cfg(feature = "std")]
use core::hash::BuildHasherDefault;
use core::{convert::TryInto, hash::Hasher};

#[cfg(feature = "std")]
use std::collections;

/// A [`collections::HashMap`] using [`ZwoHasher`] to compute hashes.
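///
/// A usage sketch (requires the `std` feature; the crate name `zwohash` is assumed):
///
/// ```
/// let mut map: zwohash::HashMap<&str, u32> = Default::default();
/// map.insert("answer", 42);
/// assert_eq!(map["answer"], 42);
/// ```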
#[cfg(feature = "std")]
pub type HashMap<K, V> = collections::HashMap<K, V, BuildHasherDefault<ZwoHasher>>;
/// A [`collections::HashSet`] using [`ZwoHasher`] to compute hashes.
#[cfg(feature = "std")]
pub type HashSet<V> = collections::HashSet<V, BuildHasherDefault<ZwoHasher>>;

/// A fast, deterministic, non-cryptographic hash for use in hash tables.
///
/// Can be constructed using [`Default`] and then used via the [`Hasher`] trait. See the
/// [`crate`] documentation for more information.
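///
/// # Example
///
/// A small sketch showing deterministic hashing (the crate name `zwohash` is assumed):
///
/// ```
/// use core::hash::Hasher;
/// use zwohash::ZwoHasher;
///
/// let mut a = ZwoHasher::default();
/// let mut b = ZwoHasher::default();
/// a.write(b"deterministic");
/// b.write(b"deterministic");
/// assert_eq!(a.finish(), b.finish());
/// ```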
pub struct ZwoHasher {
    state: usize,
}

impl Default for ZwoHasher {
    #[inline]
    fn default() -> ZwoHasher {
        ZwoHasher { state: 0 }
    }
}

// Taken from Pierre L’Ecuyer. 1999. Tables of Linear Congruential Generators of Different Sizes and
// Good Lattice Structure.
//
// This is a bit silly, because the xoring of input words and the rotation (see write_usize below)
// means that this isn't really related to an LCG. Nevertheless these constants seem to perform
// well, slightly better than a few other choices I tried. It might be worthwhile to explore the
// possible choices here more systematically.
#[cfg(target_pointer_width = "64")]
const M: usize = 0x2545f4914f6cdd1d;
#[cfg(target_pointer_width = "32")]
const M: usize = 0x2c9277b5;

// These values are chosen as the nearest integer to `bits/phi` that is coprime to `bits`. Being
// coprime to `bits` means the cumulative rotation offset cycles through all bit positions before
// repeating; being close to `bits/phi` means the sequence of cumulative rotation offsets is
// evenly distributed.
#[cfg(target_pointer_width = "64")]
const R: u32 = 41;
#[cfg(target_pointer_width = "32")]
const R: u32 = 21;

#[cfg(target_pointer_width = "64")]
type WideInt = u128;
#[cfg(target_pointer_width = "32")]
type WideInt = u64;

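// The bit and byte widths of `usize` (every bit of `0usize` is zero, so `count_zeros` yields the
// full bit width).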
const USIZE_BITS: u32 = 0usize.count_zeros();
const USIZE_BYTES: usize = core::mem::size_of::<usize>();

impl Hasher for ZwoHasher {
    #[inline]
    fn write_usize(&mut self, i: usize) {
        // All other writes are implemented via this function. It differs from FxHash in the
        // constants used and in that we xor the input word at the end. We can do this because we
        // do additional mixing in finish, which FxHash doesn't do. This way, if the first
        // write_usize is inlined, the wrapping_mul and rotate_right get const evaluated.
        self.state = self.state.wrapping_mul(M).rotate_right(R) ^ i;
    }

    #[inline]
    fn finish(&self) -> u64 {
        // Our state update (in write_usize) doesn't mix the bits very much. The wrapping_mul only
        // allows lower bits to affect higher bits, which is somewhat mitigated by the rotate_right,
        // but that still requires multiple updates to really mix the bits.
        //
        // Additionally, the last added word isn't mixed at all.
        //
        // We can work around both of these problems by performing a slightly more expensive but
        // much better mixing step here at the end. Instead of using wrapping_mul, we perform a
        // wide multiplication and subtract the high resulting word from the low one to get the
        // final hash. This allows any bit of the final state to affect any bit of the output hash.
        //
        // For hashes of short values, e.g. of single ints, this is slightly more expensive than
        // FxHash, even with more const evaluation for the first write_usize. For longer values this
        // is quickly amortized.
        //
        // See the test at the end of this file for the mixing properties this guarantees.
        let wide = (self.state as WideInt) * (M as WideInt);
        (wide as usize).wrapping_sub((wide >> USIZE_BITS) as usize) as u64
    }

    #[inline]
    fn write(&mut self, bytes: &[u8]) {
        // Working on a local copy might make the optimizer's job of compiling this easier, but I
        // haven't checked that; this is cargo-culted from rustc's FxHash.
        let mut copy = ZwoHasher { state: self.state };

        // The code below needs adjustment for other lengths of `usize`
        assert!(USIZE_BYTES == 8 || USIZE_BYTES == 4);

        #[allow(clippy::len_zero)]
        if bytes.len() >= USIZE_BYTES {
            // We iterate over all USIZE_BYTES-sized chunks, but skip the last chunk if the data
            // has a length that is an exact multiple of USIZE_BYTES, as we will process that
            // chunk below.
            let mut bytes_left = bytes;
            while bytes_left.len() > USIZE_BYTES {
                let full_chunk: [u8; USIZE_BYTES] = bytes_left[..USIZE_BYTES].try_into().unwrap();
                copy.write_usize(usize::from_ne_bytes(full_chunk));
                bytes_left = &bytes_left[USIZE_BYTES..];
            }

            // This check is completely redundant and will always be true, but without it the
            // bounds check when indexing into `bytes` isn't optimized away. Including this check
            // makes rustc optimize away both the check itself and the bounds check when indexing
            // into `bytes`. (Last tested with rustc 1.46.0.)
            if bytes.len() >= USIZE_BYTES {
                // This last chunk overlaps with the previously processed chunk if bytes has a
                // length that is not a multiple of USIZE_BYTES, but this is completely fine for
                // hashing
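                // (e.g. for 11 bytes on a 64-bit target, the loop above hashes bytes[0..8] and
                // this chunk covers bytes[3..11])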
                let last_chunk: [u8; USIZE_BYTES] =
                    bytes[bytes.len() - USIZE_BYTES..].try_into().unwrap();
                copy.write_usize(usize::from_ne_bytes(last_chunk));
            } else {
                core::unreachable!();
            }
        } else if USIZE_BYTES == 8 && bytes.len() >= 4 {
            #[cfg(target_pointer_width = "64")]
            {
                // If we have fewer than USIZE_BYTES = 8 bytes of data, but 4 or more, we can use
                // two overlapping u32 values to cover all of the input data, and those fit into a
                // single usize.
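                // (e.g. for 6 bytes, chunk_low covers bytes[0..4] and chunk_high covers
                // bytes[2..6])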
                let chunk_low: [u8; 4] = bytes[..4].try_into().unwrap();
                let chunk_high: [u8; 4] = bytes[bytes.len() - 4..].try_into().unwrap();
                let chunk_value = (u32::from_ne_bytes(chunk_low) as usize)
                    | ((u32::from_ne_bytes(chunk_high) as usize) << 32);
                copy.write_usize(chunk_value);
            }
            #[cfg(target_pointer_width = "32")]
            core::unreachable!();
        } else if bytes.len() >= 2 {
            // If we have fewer than 4 bytes of data but 2 or more, we can use two overlapping u16
            // values to cover all of the input data, and those fit into a single usize.
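            // (e.g. for 3 bytes, chunk_low covers bytes[0..2] and chunk_high covers bytes[1..3])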
            let chunk_low: [u8; 2] = bytes[..2].try_into().unwrap();
            let chunk_high: [u8; 2] = bytes[bytes.len() - 2..].try_into().unwrap();
            let chunk_value = (u16::from_ne_bytes(chunk_low) as usize)
                | ((u16::from_ne_bytes(chunk_high) as usize) << 16);
            copy.write_usize(chunk_value);
        } else if bytes.len() >= 1 {
            // Otherwise we have exactly one byte left; empty input writes nothing
            copy.write_usize(bytes[0] as usize);
        }

        self.state = copy.state;
    }

    #[inline]
    fn write_u8(&mut self, i: u8) {
        self.write_usize(i as usize);
    }

    #[inline]
    fn write_u16(&mut self, i: u16) {
        self.write_usize(i as usize);
    }

    #[inline]
    fn write_u32(&mut self, i: u32) {
        self.write_usize(i as usize);
    }

    #[cfg(target_pointer_width = "64")]
    #[inline]
    fn write_u64(&mut self, i: u64) {
        self.write_usize(i as usize);
    }

    #[cfg(target_pointer_width = "32")]
    #[inline]
    fn write_u64(&mut self, i: u64) {
        self.write_usize(i as usize);
        self.write_usize((i >> 32) as usize);
    }

    #[inline]
    fn write_u128(&mut self, i: u128) {
        self.write_u64(i as u64);
        self.write_u64((i >> 64) as u64);
    }

    #[inline]
    fn write_i8(&mut self, i: i8) {
        self.write_u8(i as u8);
    }

    #[inline]
    fn write_i16(&mut self, i: i16) {
        self.write_u16(i as u16);
    }

    #[inline]
    fn write_i32(&mut self, i: i32) {
        self.write_u32(i as u32);
    }

    #[inline]
    fn write_i64(&mut self, i: i64) {
        self.write_u64(i as u64);
    }

    #[inline]
    fn write_i128(&mut self, i: i128) {
        self.write_u128(i as u128);
    }

    #[inline]
    fn write_isize(&mut self, i: isize) {
        self.write_usize(i as usize);
    }
}

#[cfg(all(test, feature = "std"))]
mod tests {
    use super::*;
    use std::{prelude::v1::*, println};

    fn hash_usize(value: usize) -> usize {
        let mut hasher = ZwoHasher::default();
        hasher.write_usize(value);
        hasher.finish() as usize
    }

    /// Make sure that, for every 8 consecutive bits of the input, over all possible values of
    /// those 8 bits (with the others set to zero), and for every 16 consecutive bits of the
    /// output, there are almost no collisions.
    ///
    /// This is a desirable property, especially for the consecutive low and high output bits, as
    /// these are used as indices or as filters in hash tables. E.g. the stdlib's hashbrown-based
    /// hash tables take a variable number of low bits as an index and use the top 7 bits of the
    /// hash to pre-filter entries with colliding indices.
    #[test]
    fn usize_byte_subword_collision_rate() {
        let mut histogram = [0; 257];

        for i in 0..USIZE_BITS - 8 {
            for j in 0..USIZE_BITS - 16 {
                let mut hash_subbytes: Vec<_> =
                    (0..256).map(|b| (hash_usize(b << i) >> j) as u16).collect();
                hash_subbytes.sort_unstable();
                hash_subbytes.dedup();
                histogram[hash_subbytes.len()] += 1;
            }
        }

        for (len, &count) in histogram.iter().enumerate() {
            if count > 0 {
                println!("{}: {}", len, count);
            }
        }

        for (len, &count) in histogram.iter().enumerate() {
            // We allow up to one collision
            assert!(len >= 255 || count == 0);
        }
    }
}