gearhash/
lib.rs

1//! The GEAR hashing function is a fast, rolling hash function that
2//! is well suited for content defined chunking. In particular, it is
3//! used as a building block for the [FastCDC](https://www.usenix.org/node/196197)
4//! algorithm.
5//!
6//! The implementation provided in this crate consists of both a simple,
7//! scalar variant, as well as optimized versions for the SSE4.2 and AVX2
8//! instruction sets.
9//!
10//! ## Example
11//!
12//! ```
13//! fn find_all_chunks(buf: &[u8], mask: u64) -> Vec<&[u8]> {
14//!     // set up initial state
15//!     let mut chunks = vec![];
16//!     let mut offset = 0;
17//!
18//!     // create new hasher
19//!     let mut hasher = gearhash::Hasher::default();
20//!
21//!     // loop through all matches, and push the corresponding chunks
22//!     while let Some(boundary) = hasher.next_match(&buf[offset..], mask) {
23//!         chunks.push(&buf[offset..offset + boundary]);
24//!         offset += boundary;
25//!     }
26//!
27//!     // push final chunk
28//!     chunks.push(&buf[offset..]);
29//!     chunks
30//! }
31//! ```
32
33#![cfg_attr(feature = "bench", feature(test))]
34
35#[cfg(feature = "bench")]
36extern crate test;
37#[cfg(feature = "bench")]
38mod bench;
39
40mod scalar;
41mod simd;
42mod table;
43
44#[cfg(fuzzing)]
45pub mod fuzzing;
46
47pub use table::{Table, DEFAULT_TABLE};
48
49/// Gear hash state. Processes bytes to find chunk boundaries.
50#[derive(Clone)]
51pub struct Hasher<'t> {
52    table: &'t Table,
53    hash: u64,
54}
55
56impl<'t> Hasher<'t> {
57    /// Create a new hasher with the given table.
58    pub fn new(table: &'t Table) -> Self {
59        Self { table, hash: 0 }
60    }
61
62    /// Update the hash state by processing all the bytes in the given slice.
63    pub fn update(&mut self, buf: &[u8]) {
64        for b in buf.iter() {
65            self.hash = (self.hash << 1).wrapping_add(self.table[*b as usize]);
66        }
67    }
68
69    /// Match the current hash state against the given mask.
70    ///
71    /// Returns true if `hash & mask == 0`, false otherwise.
72    pub fn is_match(&self, mask: u64) -> bool {
73        self.hash & mask == 0
74    }
75
76    /// Processes the given byte slice until a match is found for the given mask.
77    ///
78    /// If a match is found before the end of the byte slice, it returns the number
79    /// of bytes processed. If no match has been found, it returns `None`.
80    pub fn next_match(&mut self, buf: &[u8], mask: u64) -> Option<usize> {
81        simd::next_match(&mut self.hash, self.table, buf, mask)
82    }
83
84    /// Retrieve the current hash value.
85    pub fn get_hash(&self) -> u64 {
86        self.hash
87    }
88
89    /// Set the hash value to the given integer.
90    pub fn set_hash(&mut self, hash: u64) {
91        self.hash = hash
92    }
93}
94
95impl Default for Hasher<'static> {
96    fn default() -> Self {
97        Hasher::new(&DEFAULT_TABLE)
98    }
99}
100
101impl<'t> std::fmt::Debug for Hasher<'t> {
102    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
103        f.debug_struct("Hasher").field("hash", &self.hash).finish()
104    }
105}