hash_roll / buzhash.rs
1#![cfg(feature = "buzhash")]
2
3//! BuzHash (aka Cyclic Polynomial Hashing) is a window based rolling hash
4//!
5//! BuzHash with various chunk-splitting methods is used in:
6//!   - [Borg](https://github.com/borgbackup/borg)
7//!   - [Attic](https://github.com/jborg/attic)
8//!     - via [silvasur/buzhash](https://github.com/silvasur/buzhash)
9//!   - [attic-labs/nom](https://github.com/attic-labs/noms/blob/26620a34bc8c95812037588869d4790b5581b34d/go/types/rolling_value_hasher.go#L15-L21)
10//!
11//! Documentation:
12//!
13//! - [Recursive Hashing Functions for n-Grams, JONATHAN D. COHEN](https://www.csee.umbc.edu/courses/graduate/676/recursivehashingp291-cohen)
14//! - ["Cyclic Polynomial", Rolling Hashes, Wikipedia](https://en.wikipedia.org/wiki/Rolling_hash#cite_ref-3)
15//!
16use crate::{Chunk, ChunkIncr, ToChunkIncr};
17use std::fmt;
18use std::num::Wrapping;
19/* Cyclic polynomial (buzhash)
20 *
21 * H = s ** (k -1) (h(c_1)) ^ s**(k-2)(h(c_2)) ^ ... ^ s(h(c_(k-1))) ^ h(c_k)
22 * where s(x) is a barrel shift of x (ABCDEFG becomes BCDEFGA, where each letter is a bit)
23 * s**y(x) is application of s(n) y times.
24 *
25 * Application:
26 *
27 *  - H <- s(H)
28 *  - c_1 <- s**k(h(c_1))
29 *  - H <- s(H) ^ s**k(h(c_1)) ^ h(c_(k+1))
30 *
31 *  Where c_1 is the character to remove,
32 *      c_(k+1) is the character to add
33 *
34 * Parameters:
35 *  - k: number of inputs to contain (can be un-capped?)
36 *  - h: a hash function from inputs to integers on [2, 2**L)
37 *
38 * State:
39 *  - every input contained in the hash (if removal is required)
40 *  - previous hash result
41 */
42
/// Describes an instance of BuzHash (aka cyclic polynomial hash).
///
/// Provides parameterization over the window size (`k`), hash function (`h`), chunk edge mask, and
/// max chunk size.
///
/// Uses fixed 32-bit width for the hash.
///
/// The trait [`BuzHashHash`] provides the internal hash function, see the implementations of it
/// for built-in hash options (which include both `Borg` and `silvasur/buzhash`'s internal hash
/// tables).
///
/// Note that it's helpful for `k` to be prime to prevent repeating strings from resulting
/// in total cancellation of the internal hash, which can cause overly long chunks.
///
/// Adjusting `mask` changes the average chunk size.
///
/// # Performance
///
/// [`BuzHash`] requires storing bytes equal to its window size (`k`). Because of this,
/// [`BuzHashIncr`] may have poor performance compared to [`BuzHash::find_chunk_edge()`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BuzHash<H: BuzHashHash> {
    /// Number of characters (bytes) to consider at once: the rolling window size.
    k: usize,

    /// A hash function over a single byte that emits a 32-bit value.
    h: H,

    /// The 1 bits indicate which bits of the hash must all be 1 to form a chunk edge
    /// (called `pattern` in `attic-labs/nom`).
    mask: u32,

    /// If the chunk length grows _above_ this size, a chunk edge is formed.
    max_chunk_size: u64,
}
78
79impl<H: BuzHashHash> BuzHash<H> {
80    /// Create an instance with the given capacity (k) and chunk termination `mask`, and a internal
81    /// `hash` function.
82    ///
83    /// `capacity` is the number of bytes that are taken into account for a given hash.
84    /// `mask` affects how chunk edges are determined.
85    /// `hash` is applied to each byte of input prior to mixing into the rolling hash.
86    pub fn new(capacity: usize, mask: u32, hash: H, max_chunk_size: u64) -> Self {
87        assert!(capacity > 0);
88        BuzHash {
89            k: capacity,
90            h: hash,
91            mask,
92            max_chunk_size,
93        }
94    }
95
96    // fn new_attic()
97    // fn new_bup()
98}
99
100impl<'a> BuzHash<BuzHashTableByteSaltHash<'a>> {
101    /// Create a buzhash instance using defaults from attic-labs/nom version 7.17
102    ///
103    /// - `k: 67`
104    /// - `hash` is the `silvasur/buzhash` table
105    /// - `mask: 1<<12 -1`
106    /// - `max_chunk_size: 1 << 24`
107    pub fn new_nom(salt: u8) -> Self {
108        BuzHash::new(
109            67,
110            (1 << 12u32) - 1,
111            BuzHashTableByteSaltHash::from((salt, &crate::buzhash_table::GO_BUZHASH)),
112            1 << 24,
113        )
114    }
115}
116
117impl<H: BuzHashHash + Clone> Chunk for BuzHash<H> {
118    type SearchState = BuzHashSearchState;
119
120    fn to_search_state(&self) -> Self::SearchState {
121        Self::SearchState::default()
122    }
123
124    fn find_chunk_edge(
125        &self,
126        state: &mut Self::SearchState,
127        data: &[u8],
128    ) -> (Option<usize>, usize) {
129        for i in state.offset..data.len() {
130            state.state.add_buf(data, self, i);
131
132            if (state.state.h & self.mask) == self.mask {
133                state.reset();
134                return (Some(i + 1), i + 1);
135            }
136
137            /*
138             * broken: `i` is not the number of bytes since prev chunk.
139             * need to track internal last chunk
140            if i as u64 > self.max_chunk_size {
141                state.reset();
142                println!(" <- CHUNK: {}", i + 1);
143                return (Some(i + 1), i + 1);
144            }
145            */
146        }
147
148        // keep k elements = discard all but k
149        let discard_ct = data.len().saturating_sub(self.k);
150        state.offset = data.len() - discard_ct;
151        (None, discard_ct)
152    }
153}
154
155impl<H: BuzHashHash + Clone> From<&BuzHash<H>> for BuzHashIncr<H> {
156    fn from(src: &BuzHash<H>) -> Self {
157        src.clone().into()
158    }
159}
160
161impl<H: BuzHashHash + Clone> ToChunkIncr for BuzHash<H> {
162    type Incr = BuzHashIncr<H>;
163    fn to_chunk_incr(&self) -> Self::Incr {
164        self.into()
165    }
166}
167
168#[derive(Debug, Clone, PartialEq, Eq, Default)]
169pub struct BuzHashSearchState {
170    offset: usize,
171    state: BuzHashState,
172}
173
174impl BuzHashSearchState {
175    fn reset(&mut self) {
176        self.offset = 0;
177        self.state.reset();
178    }
179}
180
/// The core cyclic-polynomial accumulator shared by [`BuzHash`] and [`BuzHashIncr`].
#[derive(Debug, Clone, PartialEq, Eq, Default)]
struct BuzHashState {
    /// current value of the hash.
    h: u32,
}
186
187impl BuzHashState {
188    fn reset(&mut self) {
189        self.h = 0;
190    }
191
192    fn add_buf<H: BuzHashHash>(&mut self, data: &[u8], params: &BuzHash<H>, i: usize) {
193        if i >= params.k {
194            // need to find and "remove" a entry
195            let drop_i = i - params.k;
196            let drop = data[drop_i];
197            self.add_overflow(params, data[i], drop);
198        } else {
199            // no removal
200            self.add(params, data[i]);
201        }
202    }
203
204    // insert, assuming no overflow
205    fn add<H: BuzHashHash>(&mut self, params: &BuzHash<H>, v: u8) {
206        self.h = self.h.rotate_left(1) ^ params.h.hash(v);
207    }
208
209    // insert with overflow
210    fn add_overflow<H: BuzHashHash>(&mut self, params: &BuzHash<H>, add_v: u8, remove_v: u8) {
211        let h = self.h.rotate_left(1);
212        // need to find and "remove" a entry
213        let drop = params.h.hash(remove_v).rotate_left((params.k % 8) as u32);
214        self.h = h ^ drop ^ params.h.hash(add_v);
215    }
216}
217
/// Self-contained buzhash which buffers its window of values internally
///
/// Note that this will be less efficient than using [`BuzHash`] on a slice directly,
/// but may be more convenient.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BuzHashIncr<H: BuzHashHash> {
    /// The buzhash parameters (window size, byte hash, edge mask, max chunk size).
    params: BuzHash<H>,
    /// The rolling hash accumulator.
    state: BuzHashState,
    /// Circular buffer of the most recent `params.k` input bytes (the current window).
    buf: Box<[u8]>,
    /// Next write position within `buf`; always kept in `0..params.k`.
    buf_idx: Wrapping<usize>,
    /// Number of bytes pushed since the last chunk edge (or since creation).
    input_idx: u64,
}
230
231impl<H: BuzHashHash> ChunkIncr for BuzHashIncr<H> {
232    /// Return the index in `data` immeidately following the hash matching.
233    ///
234    /// Note that you can call this multiple times to examine "subsequent" `data` slices, but the
235    /// index returned will always refer to the current `data` slice.
236    fn push(&mut self, data: &[u8]) -> Option<usize> {
237        for (i, &v) in data.iter().enumerate() {
238            self.push_byte(v);
239            if (self.state.h & self.params.mask) == self.params.mask {
240                self.reset();
241                return Some(i + 1);
242            }
243
244            if self.input_idx > self.params.max_chunk_size {
245                self.reset();
246                return Some(i + 1);
247            }
248        }
249
250        None
251    }
252}
253
254impl<H: BuzHashHash> BuzHashIncr<H> {
255    fn reset(&mut self) {
256        self.buf_idx = Wrapping(0);
257        self.input_idx = 0;
258        self.state.reset();
259    }
260
261    fn push_byte(&mut self, val: u8) {
262        if self.input_idx >= self.params.k as u64 {
263            let o = self.buf[self.buf_idx.0];
264            self.state.add_overflow(&self.params, val, o);
265        } else {
266            self.state.add(&self.params, val);
267        }
268
269        self.buf[self.buf_idx.0] = val;
270
271        self.buf_idx += Wrapping(1);
272        self.buf_idx.0 %= self.params.k;
273        self.input_idx += 1;
274    }
275}
276
277impl<H: BuzHashHash> From<BuzHash<H>> for BuzHashIncr<H> {
278    fn from(params: BuzHash<H>) -> Self {
279        let buf = vec![0; params.k].into_boxed_slice();
280        Self {
281            params,
282            state: Default::default(),
283            buf,
284            buf_idx: Wrapping(0),
285            input_idx: 0,
286        }
287    }
288}
289
/// The internal byte to u32 mapping used in buzhash
pub trait BuzHashHash {
    /// Hash a single input byte to a 32-bit value.
    fn hash(&self, data: u8) -> u32;
}
294
295/// Use a referenced table to preform the `BuzHashHash` internal hashing
296#[derive(Clone)]
297pub struct BuzHashTableHash<'a> {
298    table: &'a [u32; 256],
299}
300
301impl<'a> fmt::Debug for BuzHashTableHash<'a> {
302    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
303        fmt.debug_struct("BuzHashTableHash").finish()
304    }
305}
306
307impl<'a> From<&'a [u32; 256]> for BuzHashTableHash<'a> {
308    fn from(table: &'a [u32; 256]) -> Self {
309        Self { table }
310    }
311}
312
313impl<'a> BuzHashHash for BuzHashTableHash<'a> {
314    fn hash(&self, data: u8) -> u32 {
315        self.table[data as usize]
316    }
317}
318
319/// Use a owned table to perform the `BuzHashHash` internal hashing
320#[derive(Clone)]
321pub struct BuzHashTableBufHash {
322    table: Box<[u32; 256]>,
323}
324
325impl fmt::Debug for BuzHashTableBufHash {
326    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
327        fmt.debug_struct("BuzHashTableBufHash").finish()
328    }
329}
330
331impl<'a> From<Box<[u32; 256]>> for BuzHashTableBufHash {
332    fn from(table: Box<[u32; 256]>) -> Self {
333        Self { table }
334    }
335}
336
337impl BuzHashHash for BuzHashTableBufHash {
338    fn hash(&self, data: u8) -> u32 {
339        self.table[data as usize]
340    }
341}
342
343/// Lookup up in a table, after applying a salt via xor to the input byte
344///
345/// Used by attic-labs/nom
346#[derive(Clone)]
347pub struct BuzHashTableByteSaltHash<'a> {
348    table: &'a [u32; 256],
349    salt: u8,
350}
351
352impl<'a> fmt::Debug for BuzHashTableByteSaltHash<'a> {
353    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
354        fmt.debug_struct("BuzHashTableByteSaltHash").finish()
355    }
356}
357
358impl<'a> From<(u8, &'a [u32; 256])> for BuzHashTableByteSaltHash<'a> {
359    fn from((salt, table): (u8, &'a [u32; 256])) -> Self {
360        Self { table, salt }
361    }
362}
363
364impl<'a> BuzHashHash for BuzHashTableByteSaltHash<'a> {
365    fn hash(&self, data: u8) -> u32 {
366        self.table[(data ^ self.salt) as usize]
367    }
368}