twox_hash/
xxhash32.rs

1//! The implementation of XXH32.
2
3use core::{
4    fmt,
5    hash::{self, BuildHasher},
6    mem,
7};
8
9use crate::{IntoU32, IntoU64};
10
11// Keeping these constants in this form to match the C code.
12const PRIME32_1: u32 = 0x9E3779B1;
13const PRIME32_2: u32 = 0x85EBCA77;
14const PRIME32_3: u32 = 0xC2B2AE3D;
15const PRIME32_4: u32 = 0x27D4EB2F;
16const PRIME32_5: u32 = 0x165667B1;
17
18type Lane = u32;
19type Lanes = [Lane; 4];
20type Bytes = [u8; 16];
21
22const BYTES_IN_LANE: usize = mem::size_of::<Bytes>();
23
24#[derive(Clone, PartialEq)]
25struct BufferData(Lanes);
26
27impl BufferData {
28    const fn new() -> Self {
29        Self([0; 4])
30    }
31
32    const fn bytes(&self) -> &Bytes {
33        const _: () = assert!(mem::align_of::<u8>() <= mem::align_of::<Lane>());
34        // SAFETY[bytes]: The alignment of `u32` is at least that of
35        // `u8` and all the values are initialized.
36        unsafe { &*self.0.as_ptr().cast() }
37    }
38
39    fn bytes_mut(&mut self) -> &mut Bytes {
40        // SAFETY: See SAFETY[bytes]
41        unsafe { &mut *self.0.as_mut_ptr().cast() }
42    }
43}
44
45impl fmt::Debug for BufferData {
46    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47        f.debug_list().entries(self.0.iter()).finish()
48    }
49}
50
51#[derive(Debug, Clone, PartialEq)]
52struct Buffer {
53    offset: usize,
54    data: BufferData,
55}
56
57impl Buffer {
58    const fn new() -> Self {
59        Self {
60            offset: 0,
61            data: BufferData::new(),
62        }
63    }
64
65    // RATIONALE: See RATIONALE[inline]
66    #[inline]
67    fn extend<'d>(&mut self, data: &'d [u8]) -> (Option<&Lanes>, &'d [u8]) {
68        // Most of the slice methods we use here have `_unchecked` variants, but
69        //
70        // 1. this method is called one time per `Hasher::write` call
71        // 2. this method early exits if we don't have anything in the buffer
72        //
73        // Because of this, removing the panics via `unsafe` doesn't
74        // have much benefit other than reducing code size by a tiny
75        // bit.
76
77        if self.offset == 0 {
78            return (None, data);
79        };
80
81        let bytes = self.data.bytes_mut();
82        debug_assert!(self.offset <= bytes.len());
83
84        let empty = &mut bytes[self.offset..];
85        let n_to_copy = usize::min(empty.len(), data.len());
86
87        let dst = &mut empty[..n_to_copy];
88
89        let (src, rest) = data.split_at(n_to_copy);
90
91        dst.copy_from_slice(src);
92        self.offset += n_to_copy;
93
94        debug_assert!(self.offset <= bytes.len());
95
96        if self.offset == bytes.len() {
97            self.offset = 0;
98            (Some(&self.data.0), rest)
99        } else {
100            (None, rest)
101        }
102    }
103
104    // RATIONALE: See RATIONALE[inline]
105    #[inline]
106    fn set(&mut self, data: &[u8]) {
107        if data.is_empty() {
108            return;
109        }
110
111        debug_assert_eq!(self.offset, 0);
112
113        let n_to_copy = data.len();
114
115        let bytes = self.data.bytes_mut();
116        debug_assert!(n_to_copy < bytes.len());
117
118        bytes[..n_to_copy].copy_from_slice(data);
119        self.offset = data.len();
120    }
121
122    // RATIONALE: See RATIONALE[inline]
123    #[inline]
124    fn remaining(&self) -> &[u8] {
125        &self.data.bytes()[..self.offset]
126    }
127}
128
129#[derive(Clone, PartialEq)]
130struct Accumulators(Lanes);
131
132impl Accumulators {
133    const fn new(seed: u32) -> Self {
134        Self([
135            seed.wrapping_add(PRIME32_1).wrapping_add(PRIME32_2),
136            seed.wrapping_add(PRIME32_2),
137            seed,
138            seed.wrapping_sub(PRIME32_1),
139        ])
140    }
141
142    // RATIONALE: See RATIONALE[inline]
143    #[inline]
144    fn write(&mut self, lanes: Lanes) {
145        let [acc1, acc2, acc3, acc4] = &mut self.0;
146        let [lane1, lane2, lane3, lane4] = lanes;
147
148        *acc1 = round(*acc1, lane1.to_le());
149        *acc2 = round(*acc2, lane2.to_le());
150        *acc3 = round(*acc3, lane3.to_le());
151        *acc4 = round(*acc4, lane4.to_le());
152    }
153
154    // RATIONALE: See RATIONALE[inline]
155    #[inline]
156    fn write_many<'d>(&mut self, mut data: &'d [u8]) -> &'d [u8] {
157        while let Some((chunk, rest)) = data.split_first_chunk::<BYTES_IN_LANE>() {
158            // SAFETY: We have the right number of bytes and are
159            // handling the unaligned case.
160            let lanes = unsafe { chunk.as_ptr().cast::<Lanes>().read_unaligned() };
161            self.write(lanes);
162            data = rest;
163        }
164        data
165    }
166
167    // RATIONALE: See RATIONALE[inline]
168    #[inline]
169    const fn finish(&self) -> u32 {
170        let [acc1, acc2, acc3, acc4] = self.0;
171
172        let acc1 = acc1.rotate_left(1);
173        let acc2 = acc2.rotate_left(7);
174        let acc3 = acc3.rotate_left(12);
175        let acc4 = acc4.rotate_left(18);
176
177        acc1.wrapping_add(acc2)
178            .wrapping_add(acc3)
179            .wrapping_add(acc4)
180    }
181}
182
183impl fmt::Debug for Accumulators {
184    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
185        let [acc1, acc2, acc3, acc4] = self.0;
186        f.debug_struct("Accumulators")
187            .field("acc1", &acc1)
188            .field("acc2", &acc2)
189            .field("acc3", &acc3)
190            .field("acc4", &acc4)
191            .finish()
192    }
193}
194
195/// Calculates the 32-bit hash.
196///
197/// ### Caution
198///
199/// Although this struct implements [`hash::Hasher`][], it only calculates a
200/// 32-bit number, leaving the upper bits as 0. This means it is
201/// unlikely to be correct to use this in places like a [`HashMap`][std::collections::HashMap].
202#[derive(Debug, Clone, PartialEq)]
203pub struct Hasher {
204    seed: u32,
205    accumulators: Accumulators,
206    buffer: Buffer,
207    length: u64,
208}
209
210impl Default for Hasher {
211    fn default() -> Self {
212        Self::with_seed(0)
213    }
214}
215
216impl Hasher {
217    /// Hash all data at once. If you can use this function, you may
218    /// see noticable speed gains for certain types of input.
219    #[must_use]
220    // RATIONALE[inline]: Keeping parallel to the 64-bit
221    // implementation, even though the performance gains for the
222    // 32-bit version haven't been tested.
223    #[inline]
224    pub fn oneshot(seed: u32, data: &[u8]) -> u32 {
225        let len = data.len();
226
227        // Since we know that there's no more data coming, we don't
228        // need to construct the intermediate buffers or copy data to
229        // or from the buffers.
230
231        let mut accumulators = Accumulators::new(seed);
232
233        let data = accumulators.write_many(data);
234
235        Self::finish_with(seed, len.into_u64(), &accumulators, data)
236    }
237
238    /// Constructs the hasher with an initial seed.
239    #[must_use]
240    pub const fn with_seed(seed: u32) -> Self {
241        // Step 1. Initialize internal accumulators
242        Self {
243            seed,
244            accumulators: Accumulators::new(seed),
245            buffer: Buffer::new(),
246            length: 0,
247        }
248    }
249
250    /// The seed this hasher was created with.
251    pub const fn seed(&self) -> u32 {
252        self.seed
253    }
254
255    /// The total number of bytes hashed.
256    pub const fn total_len(&self) -> u64 {
257        self.length
258    }
259
260    /// The total number of bytes hashed, truncated to 32 bits.
261    ///
262    /// For the full 64-bit byte count, use [`total_len`](Self::total_len).
263    pub const fn total_len_32(&self) -> u32 {
264        self.length as u32
265    }
266
267    /// Returns the hash value for the values written so far. Unlike
268    /// [`hash::Hasher::finish`][], this method returns the actual 32-bit
269    /// value calculated, not a 64-bit value.
270    #[must_use]
271    // RATIONALE: See RATIONALE[inline]
272    #[inline]
273    pub fn finish_32(&self) -> u32 {
274        Self::finish_with(
275            self.seed,
276            self.length,
277            &self.accumulators,
278            self.buffer.remaining(),
279        )
280    }
281
282    #[must_use]
283    // RATIONALE: See RATIONALE[inline]
284    #[inline]
285    fn finish_with(seed: u32, len: u64, accumulators: &Accumulators, mut remaining: &[u8]) -> u32 {
286        // Step 3. Accumulator convergence
287        let mut acc = if len < BYTES_IN_LANE.into_u64() {
288            seed.wrapping_add(PRIME32_5)
289        } else {
290            accumulators.finish()
291        };
292
293        // Step 4. Add input length
294        //
295        // "Note that, if input length is so large that it requires
296        // more than 32-bits, only the lower 32-bits are added to the
297        // accumulator."
298        acc += len as u32;
299
300        // Step 5. Consume remaining input
301        while let Some((chunk, rest)) = remaining.split_first_chunk() {
302            let lane = u32::from_ne_bytes(*chunk).to_le();
303
304            acc = acc.wrapping_add(lane.wrapping_mul(PRIME32_3));
305            acc = acc.rotate_left(17).wrapping_mul(PRIME32_4);
306
307            remaining = rest;
308        }
309
310        for &byte in remaining {
311            let lane = byte.into_u32();
312
313            acc = acc.wrapping_add(lane.wrapping_mul(PRIME32_5));
314            acc = acc.rotate_left(11).wrapping_mul(PRIME32_1);
315        }
316
317        // Step 6. Final mix (avalanche)
318        acc ^= acc >> 15;
319        acc = acc.wrapping_mul(PRIME32_2);
320        acc ^= acc >> 13;
321        acc = acc.wrapping_mul(PRIME32_3);
322        acc ^= acc >> 16;
323
324        acc
325    }
326}
327
328impl hash::Hasher for Hasher {
329    // RATIONALE: See RATIONALE[inline]
330    #[inline]
331    fn write(&mut self, data: &[u8]) {
332        let len = data.len();
333
334        // Step 2. Process stripes
335        let (buffered_lanes, data) = self.buffer.extend(data);
336
337        if let Some(&lanes) = buffered_lanes {
338            self.accumulators.write(lanes);
339        }
340
341        let data = self.accumulators.write_many(data);
342
343        self.buffer.set(data);
344
345        self.length += len.into_u64();
346    }
347
348    // RATIONALE: See RATIONALE[inline]
349    #[inline]
350    fn finish(&self) -> u64 {
351        Hasher::finish_32(self).into()
352    }
353}
354
355// RATIONALE: See RATIONALE[inline]
356#[inline]
357const fn round(mut acc: u32, lane: u32) -> u32 {
358    acc = acc.wrapping_add(lane.wrapping_mul(PRIME32_2));
359    acc = acc.rotate_left(13);
360    acc.wrapping_mul(PRIME32_1)
361}
362
363/// Constructs [`Hasher`][] for multiple hasher instances. See
364/// the [usage warning][Hasher#caution].
365#[derive(Clone)]
366pub struct State(u32);
367
368impl State {
369    /// Constructs the hasher with an initial seed.
370    pub fn with_seed(seed: u32) -> Self {
371        Self(seed)
372    }
373}
374
375impl BuildHasher for State {
376    type Hasher = Hasher;
377
378    fn build_hasher(&self) -> Self::Hasher {
379        Hasher::with_seed(self.0)
380    }
381}
382
383#[cfg(test)]
384mod test {
385    use core::{
386        array,
387        hash::{BuildHasherDefault, Hasher as _},
388    };
389    use std::collections::HashMap;
390
391    use super::*;
392
393    const _TRAITS: () = {
394        const fn is_clone<T: Clone>() {}
395        is_clone::<Hasher>();
396        is_clone::<State>();
397    };
398
399    const EMPTY_BYTES: [u8; 0] = [];
400
401    #[test]
402    fn ingesting_byte_by_byte_is_equivalent_to_large_chunks() {
403        let bytes = [0; 32];
404
405        let mut byte_by_byte = Hasher::with_seed(0);
406        for byte in bytes.chunks(1) {
407            byte_by_byte.write(byte);
408        }
409        let byte_by_byte = byte_by_byte.finish();
410
411        let mut one_chunk = Hasher::with_seed(0);
412        one_chunk.write(&bytes);
413        let one_chunk = one_chunk.finish();
414
415        assert_eq!(byte_by_byte, one_chunk);
416    }
417
418    #[test]
419    fn hash_of_nothing_matches_c_implementation() {
420        let mut hasher = Hasher::with_seed(0);
421        hasher.write(&EMPTY_BYTES);
422        assert_eq!(hasher.finish(), 0x02cc_5d05);
423    }
424
425    #[test]
426    fn hash_of_single_byte_matches_c_implementation() {
427        let mut hasher = Hasher::with_seed(0);
428        hasher.write(&[42]);
429        assert_eq!(hasher.finish(), 0xe0fe_705f);
430    }
431
432    #[test]
433    fn hash_of_multiple_bytes_matches_c_implementation() {
434        let mut hasher = Hasher::with_seed(0);
435        hasher.write(b"Hello, world!\0");
436        assert_eq!(hasher.finish(), 0x9e5e_7e93);
437    }
438
439    #[test]
440    fn hash_of_multiple_chunks_matches_c_implementation() {
441        let bytes: [u8; 100] = array::from_fn(|i| i as u8);
442        let mut hasher = Hasher::with_seed(0);
443        hasher.write(&bytes);
444        assert_eq!(hasher.finish(), 0x7f89_ba44);
445    }
446
447    #[test]
448    fn hash_with_different_seed_matches_c_implementation() {
449        let mut hasher = Hasher::with_seed(0x42c9_1977);
450        hasher.write(&EMPTY_BYTES);
451        assert_eq!(hasher.finish(), 0xd6bf_8459);
452    }
453
454    #[test]
455    fn hash_with_different_seed_and_multiple_chunks_matches_c_implementation() {
456        let bytes: [u8; 100] = array::from_fn(|i| i as u8);
457        let mut hasher = Hasher::with_seed(0x42c9_1977);
458        hasher.write(&bytes);
459        assert_eq!(hasher.finish(), 0x6d2f_6c17);
460    }
461
462    #[test]
463    fn hashes_with_different_offsets_are_the_same() {
464        let bytes = [0x7c; 4096];
465        let expected = Hasher::oneshot(0, &[0x7c; 64]);
466
467        let the_same = bytes
468            .windows(64)
469            .map(|w| {
470                let mut hasher = Hasher::with_seed(0);
471                hasher.write(w);
472                hasher.finish_32()
473            })
474            .all(|h| h == expected);
475        assert!(the_same);
476    }
477
478    // This test validates wraparound/truncation behavior for very
479    // large inputs of a 32-bit hash, but runs very slowly in the
480    // normal "cargo test" build config since it hashes 4.3GB of
481    // data. It runs reasonably quick under "cargo test --release".
482    #[ignore]
483    #[test]
484    fn length_overflows_32bit() {
485        // Hash 4.3 billion (4_300_000_000) bytes, which overflows a u32.
486        let bytes200: [u8; 200] = array::from_fn(|i| i as _);
487
488        let mut hasher = Hasher::with_seed(0);
489        for _ in 0..(4_300_000_000 / bytes200.len()) {
490            hasher.write(&bytes200);
491        }
492
493        assert_eq!(hasher.total_len(), 0x0000_0001_004c_cb00);
494        assert_eq!(hasher.total_len_32(), 0x004c_cb00);
495
496        // compared against the C implementation
497        assert_eq!(hasher.finish(), 0x1522_4ca7);
498    }
499
500    #[test]
501    fn can_be_used_in_a_hashmap_with_a_default_seed() {
502        let mut hash: HashMap<_, _, BuildHasherDefault<Hasher>> = Default::default();
503        hash.insert(42, "the answer");
504        assert_eq!(hash.get(&42), Some(&"the answer"));
505    }
506}
507
508#[cfg(feature = "random")]
509#[cfg_attr(docsrs, doc(cfg(feature = "random")))]
510mod random_impl {
511    use super::*;
512
513    /// Constructs a randomized seed and reuses it for multiple hasher
514    /// instances. See the [usage warning][Hasher#caution].
515    #[derive(Clone)]
516    pub struct RandomState(State);
517
518    impl Default for RandomState {
519        fn default() -> Self {
520            Self::new()
521        }
522    }
523
524    impl RandomState {
525        fn new() -> Self {
526            Self(State::with_seed(rand::random()))
527        }
528    }
529
530    impl BuildHasher for RandomState {
531        type Hasher = Hasher;
532
533        fn build_hasher(&self) -> Self::Hasher {
534            self.0.build_hasher()
535        }
536    }
537
538    #[cfg(test)]
539    mod test {
540        use std::collections::HashMap;
541
542        use super::*;
543
544        const _: () = {
545            const fn is_clone<T: Clone>() {}
546            is_clone::<Hasher>();
547            is_clone::<RandomState>();
548        };
549
550        #[test]
551        fn can_be_used_in_a_hashmap_with_a_random_seed() {
552            let mut hash: HashMap<_, _, RandomState> = Default::default();
553            hash.insert(42, "the answer");
554            assert_eq!(hash.get(&42), Some(&"the answer"));
555        }
556    }
557}
558
559#[cfg(feature = "random")]
560#[cfg_attr(docsrs, doc(cfg(feature = "random")))]
561pub use random_impl::*;
562
563#[cfg(feature = "serialize")]
564#[cfg_attr(docsrs, doc(cfg(feature = "serialize")))]
565mod serialize_impl {
566    use serde::{Deserialize, Serialize};
567
568    use super::*;
569
570    impl<'de> Deserialize<'de> for Hasher {
571        fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
572        where
573            D: serde::Deserializer<'de>,
574        {
575            let shim = Deserialize::deserialize(deserializer)?;
576
577            let Shim {
578                total_len,
579                seed,
580                core,
581                buffer,
582                buffer_usage,
583            } = shim;
584            let Core { v1, v2, v3, v4 } = core;
585
586            let mut buffer_data = BufferData::new();
587            buffer_data.bytes_mut().copy_from_slice(&buffer);
588
589            Ok(Hasher {
590                seed,
591                accumulators: Accumulators([v1, v2, v3, v4]),
592                buffer: Buffer {
593                    offset: buffer_usage,
594                    data: buffer_data,
595                },
596                length: total_len,
597            })
598        }
599    }
600
601    impl Serialize for Hasher {
602        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
603        where
604            S: serde::Serializer,
605        {
606            let Hasher {
607                seed,
608                ref accumulators,
609                ref buffer,
610                length,
611            } = *self;
612            let [v1, v2, v3, v4] = accumulators.0;
613            let Buffer { offset, ref data } = *buffer;
614            let buffer = *data.bytes();
615
616            let shim = Shim {
617                total_len: length,
618                seed,
619                core: Core { v1, v2, v3, v4 },
620                buffer,
621                buffer_usage: offset,
622            };
623
624            shim.serialize(serializer)
625        }
626    }
627
628    #[derive(Serialize, Deserialize)]
629    struct Shim {
630        total_len: u64,
631        seed: u32,
632        core: Core,
633        buffer: [u8; 16],
634        buffer_usage: usize,
635    }
636
637    #[derive(Serialize, Deserialize)]
638    struct Core {
639        v1: u32,
640        v2: u32,
641        v3: u32,
642        v4: u32,
643    }
644
645    #[cfg(test)]
646    mod test {
647        use std::hash::Hasher as _;
648
649        use super::*;
650
651        type Result<T = (), E = serde_json::Error> = core::result::Result<T, E>;
652
653        #[test]
654        fn test_serialization_cycle() -> Result {
655            let mut hasher = Hasher::with_seed(0);
656            hasher.write(b"Hello, world!\0");
657            let _ = hasher.finish();
658
659            let serialized = serde_json::to_string(&hasher)?;
660            let unserialized: Hasher = serde_json::from_str(&serialized)?;
661            assert_eq!(hasher, unserialized);
662            Ok(())
663        }
664
665        #[test]
666        fn test_serialization_stability() -> Result {
667            let mut hasher = Hasher::with_seed(0);
668            hasher.write(b"Hello, world!\0");
669            let _ = hasher.finish();
670
671            let expected_serialized = r#"{
672                "total_len": 14,
673                "seed": 0,
674                "core": {
675                  "v1": 606290984,
676                  "v2": 2246822519,
677                  "v3": 0,
678                  "v4": 1640531535
679                },
680                "buffer": [
681                  72,  101, 108, 108, 111, 44, 32, 119,
682                  111, 114, 108, 100, 33,  0,  0,  0
683                ],
684                "buffer_usage": 14
685            }"#;
686
687            let unserialized: Hasher = serde_json::from_str(expected_serialized)?;
688            assert_eq!(hasher, unserialized);
689
690            let expected_value: serde_json::Value = serde_json::from_str(expected_serialized)?;
691            let actual_value = serde_json::to_value(&hasher)?;
692            assert_eq!(expected_value, actual_value);
693
694            Ok(())
695        }
696    }
697}