Skip to main content

triblespace_core/patch/
bytetable.rs

1//!
2//! The number of buckets is doubled with each table growth, which is not only
3//! commonly used middle ground for growing data-structures between expensive
4//! allocation/reallocation and unused memory, but also limits the work required
5//! for rehashing as we will see shortly.
6//!
7//! The hash functions used are parameterised over the current size of the table
8//! and are what we call "compressed permutations", where the whole function is
9//! composed of two separate parametric operations
10//!
11//! hash(size) = compression(size) • permutation
12//!
13//!  * permutation: domain(hash) → [0 .. |domain|] ⊆ Nat;
14//!    reifies the randomness of the hash as a (read lossless) bijection from the
15//!    hash domain to the natural numbers
16//!  * compression: range(permutation) → range(hash);
17//!    which reduces (read lossy) the range of the permutation so that multiple
18//!    values of the hashes range are pigeonholed to the same element of its domain
19//!
20//! The compression operation we use truncates the upper (most significant) bits
21//! of the input so that it's range is equal to
22//! [0 .. |buckets|].
23//!
24//! compression(size, x) = ~(~0 << log2(size)) & x
25//!
26//! The limitation to sizes of a power of two aligns with the doubling of the
27//! hash table at each growth. In fact using the number of doublings as the parameter makes the log2 call superfluous.
28//!
29//! This compression function has an important property, as a new
30//! most significant bit is taken into consideration with each growth,
31//! each item either keeps its position or is moved to its position * 2.
32//! The only maintenance operation required to keep the hash consistent
33//! for each growth and parameter change is therefore to traverse the lower half
34//! of buckets and copy elements where neither updated hash points to their
35//! current bucket, to the corresponding bucket in the upper half.
36//! Incidentally this might flip the hash function used for this entry.
37
38use rand::seq::SliceRandom;
39use rand::thread_rng;
40use std::fmt::Debug;
41use std::sync::Once;
42
43/// The number of slots per bucket.
44const BUCKET_ENTRY_COUNT: usize = 2;
45
46/// The maximum number of slots per table.
47const MAX_SLOT_COUNT: usize = 256;
48
49/// The maximum number of cuckoo displacements attempted during
50/// insert before the size of the table is increased.
51const MAX_RETRIES: usize = 2;
52
53/// Global randomness used for bucket selection.
54static mut RANDOM_PERMUTATION_RAND: [u8; 256] = [0; 256];
55static mut RANDOM_PERMUTATION_HASH: [u8; 256] = [0; 256];
56static INIT: Once = Once::new();
57
58/// Initialise the randomness source and hash function
59/// used by all tables.
60pub fn init() {
61    INIT.call_once(|| {
62        let mut rng = thread_rng();
63        let mut bytes: [u8; 256] = [0; 256];
64
65        for (i, b) in bytes.iter_mut().enumerate() {
66            *b = i as u8;
67        }
68
69        bytes.shuffle(&mut rng);
70        unsafe {
71            RANDOM_PERMUTATION_HASH = bytes;
72        }
73
74        bytes.shuffle(&mut rng);
75        unsafe {
76            RANDOM_PERMUTATION_RAND = bytes;
77        }
78    });
79}
80
81/// Types must implement this trait in order to be storable in the byte table.
82///
83/// # Safety
84///
85/// Implementors must ensure that `key()` returns `None` iff the memory of the
86/// type is `mem::zeroed()`. Failure to uphold this contract may lead to
87/// incorrect behavior when entries are inserted into the table.
88pub unsafe trait ByteEntry {
89    /// Returns the byte key that identifies this entry's bucket.
90    fn key(&self) -> u8;
91}
92
93/// Represents the hashtable's internal buckets, which allow for up to
94/// `BUCKET_ENTRY_COUNT` elements to share the same colliding hash values.
95/// Buckets are laid out implicitly in a flat slice so bucket operations simply
96/// compute offsets into the table rather than delegating to a trait.
97///
98/// A cheap hash *cough* identity *cough* function that maps every entry to an
99/// almost linear ordering (modulo `BUCKET_ENTRY_COUNT`) when maximally grown.
100#[inline]
101fn cheap_hash(byte_key: u8) -> u8 {
102    byte_key
103}
104
105/// A hash function that uses a lookup table to provide a random bijective
106/// byte -> byte mapping.
107#[inline]
108fn rand_hash(byte_key: u8) -> u8 {
109    unsafe { RANDOM_PERMUTATION_HASH[byte_key as usize] }
110}
111
112/// Cut off the upper bits so that it fits in the bucket count.
113#[inline]
114fn compress_hash(slot_count: usize, hash: u8) -> u8 {
115    let bucket_count = (slot_count / BUCKET_ENTRY_COUNT) as u8;
116    let mask = bucket_count - 1;
117    hash & mask
118}
119
120#[derive(Clone, Copy)]
121struct ByteSet([u128; 2]);
122
123impl ByteSet {
124    fn new_empty() -> Self {
125        ByteSet([0, 0])
126    }
127
128    fn insert(&mut self, idx: u8) {
129        let bit = (idx & 0b0111_1111) as u32;
130        self.0[(idx >> 7) as usize] |= 1u128 << bit;
131    }
132
133    fn remove(&mut self, idx: u8) {
134        let bit = (idx & 0b0111_1111) as u32;
135        self.0[(idx >> 7) as usize] &= !(1u128 << bit);
136    }
137
138    fn contains(&self, idx: u8) -> bool {
139        let bit = (idx & 0b0111_1111) as u32;
140        (self.0[(idx >> 7) as usize] & (1u128 << bit)) != 0
141    }
142}
143
144fn plan_insert<T: ByteEntry + Debug>(
145    table: &mut [Option<T>],
146    bucket_idx: usize,
147    depth: usize,
148    visited: &mut ByteSet,
149) -> Option<usize> {
150    let bucket_start = bucket_idx * BUCKET_ENTRY_COUNT;
151
152    for slot_idx in 0..BUCKET_ENTRY_COUNT {
153        if table[bucket_start + slot_idx].is_none() {
154            return Some(bucket_start + slot_idx);
155        }
156    }
157
158    if depth == 0 {
159        return None;
160    }
161
162    for slot_idx in 0..BUCKET_ENTRY_COUNT {
163        let key = table[bucket_start + slot_idx]
164            .as_ref()
165            .expect("slot must be occupied")
166            .key();
167        if visited.contains(key) {
168            continue;
169        }
170        visited.insert(key);
171
172        let cheap = compress_hash(table.len(), cheap_hash(key)) as usize;
173        let rand = compress_hash(table.len(), rand_hash(key)) as usize;
174        // Try the other bucket that the key could occupy.
175        let alt_idx = if bucket_idx == cheap { rand } else { cheap };
176        if alt_idx != bucket_idx {
177            if let Some(hole_idx) = plan_insert(table, alt_idx, depth - 1, visited) {
178                table[hole_idx] = table[bucket_start + slot_idx].take();
179                visited.remove(key);
180                return Some(bucket_start + slot_idx);
181            }
182        }
183
184        visited.remove(key);
185    }
186
187    None
188}
189
190/// Operations on a cuckoo hash table indexed by single-byte keys.
191pub trait ByteTable<T: ByteEntry + Debug> {
192    /// Looks up an entry by its byte key, returning a reference if found.
193    fn table_get(&self, byte_key: u8) -> Option<&T>;
194    /// Returns a mutable reference to the slot holding `byte_key`, if present.
195    fn table_get_slot(&mut self, byte_key: u8) -> Option<&mut Option<T>>;
196    /// Inserts `entry` into the table, returning it back if the table is full.
197    fn table_insert(&mut self, entry: T) -> Option<T>;
198    /// Moves entries from `self` into `grown`, which must be twice the size.
199    fn table_grow(&mut self, grown: &mut Self);
200}
201
202impl<T: ByteEntry + Debug> ByteTable<T> for [Option<T>] {
203    fn table_get(&self, byte_key: u8) -> Option<&T> {
204        let cheap_start =
205            compress_hash(self.len(), cheap_hash(byte_key)) as usize * BUCKET_ENTRY_COUNT;
206        for slot in 0..BUCKET_ENTRY_COUNT {
207            if let Some(entry) = self[cheap_start + slot].as_ref() {
208                if entry.key() == byte_key {
209                    return Some(entry);
210                }
211            }
212        }
213
214        let rand_start =
215            compress_hash(self.len(), rand_hash(byte_key)) as usize * BUCKET_ENTRY_COUNT;
216        for slot in 0..BUCKET_ENTRY_COUNT {
217            if let Some(entry) = self[rand_start + slot].as_ref() {
218                if entry.key() == byte_key {
219                    return Some(entry);
220                }
221            }
222        }
223        None
224    }
225
226    fn table_get_slot(&mut self, byte_key: u8) -> Option<&mut Option<T>> {
227        let cheap_start =
228            compress_hash(self.len(), cheap_hash(byte_key)) as usize * BUCKET_ENTRY_COUNT;
229        for slot in 0..BUCKET_ENTRY_COUNT {
230            let idx = cheap_start + slot;
231            if let Some(entry) = self[idx].as_ref() {
232                if entry.key() == byte_key {
233                    return Some(&mut self[idx]);
234                }
235            }
236        }
237
238        let rand_start =
239            compress_hash(self.len(), rand_hash(byte_key)) as usize * BUCKET_ENTRY_COUNT;
240        for slot in 0..BUCKET_ENTRY_COUNT {
241            let idx = rand_start + slot;
242            if let Some(entry) = self[idx].as_ref() {
243                if entry.key() == byte_key {
244                    return Some(&mut self[idx]);
245                }
246            }
247        }
248        None
249    }
250
251    /// An entry with the same key must not exist in the table yet.
252    fn table_insert(&mut self, inserted: T) -> Option<T> {
253        debug_assert!(self.table_get(inserted.key()).is_none());
254
255        let mut visited = ByteSet::new_empty();
256        let key = inserted.key();
257        visited.insert(key);
258        let limit = if self.len() == MAX_SLOT_COUNT {
259            MAX_SLOT_COUNT
260        } else {
261            MAX_RETRIES
262        };
263
264        let cheap_bucket = compress_hash(self.len(), cheap_hash(key)) as usize;
265        if let Some(slot) = plan_insert(self, cheap_bucket, limit, &mut visited) {
266            self[slot] = Some(inserted);
267            return None;
268        }
269
270        let rand_bucket = compress_hash(self.len(), rand_hash(key)) as usize;
271        if let Some(slot) = plan_insert(self, rand_bucket, limit, &mut visited) {
272            self[slot] = Some(inserted);
273            return None;
274        }
275
276        Some(inserted)
277    }
278
279    fn table_grow(&mut self, grown: &mut Self) {
280        debug_assert!(self.len() * 2 == grown.len());
281        let buckets_len = self.len() / BUCKET_ENTRY_COUNT;
282        let grown_len = grown.len();
283        let (lower_portion, upper_portion) = grown.split_at_mut(self.len());
284        for bucket_index in 0..buckets_len {
285            let start = bucket_index * BUCKET_ENTRY_COUNT;
286            for slot in 0..BUCKET_ENTRY_COUNT {
287                if let Some(entry) = self[start + slot].take() {
288                    let byte_key = entry.key();
289                    let cheap_index = compress_hash(grown_len, cheap_hash(byte_key));
290                    let rand_index = compress_hash(grown_len, rand_hash(byte_key));
291
292                    let dest_bucket =
293                        if bucket_index as u8 == cheap_index || bucket_index as u8 == rand_index {
294                            &mut lower_portion[start..start + BUCKET_ENTRY_COUNT]
295                        } else {
296                            &mut upper_portion[start..start + BUCKET_ENTRY_COUNT]
297                        };
298
299                    for dest_slot in dest_bucket.iter_mut() {
300                        if dest_slot.is_none() {
301                            *dest_slot = Some(entry);
302                            break;
303                        }
304                    }
305                }
306            }
307        }
308    }
309}
310
311#[cfg(test)]
312mod tests {
313    use super::*;
314    use proptest::prelude::*;
315
316    #[derive(Copy, Clone, Debug)]
317    #[repr(C)]
318    struct DummyEntry {
319        value: u8,
320    }
321
322    impl DummyEntry {
323        fn new(byte_key: u8) -> Self {
324            DummyEntry { value: byte_key }
325        }
326    }
327
328    unsafe impl ByteEntry for DummyEntry {
329        fn key(&self) -> u8 {
330            self.value
331        }
332    }
333
334    proptest! {
335        #[test]
336        fn empty_table_then_empty_get(n in 0u8..255) {
337            init();
338            let table: [Option<DummyEntry>; 4] = [None; 4];
339            prop_assert!(table.table_get(n).is_none());
340        }
341
342        #[test]
343        fn single_insert_success(n in 0u8..255) {
344            init();
345            let mut table: [Option<DummyEntry>; 4] = [None; 4];
346            let entry = DummyEntry::new(n);
347            let displaced = table.table_insert(entry);
348            prop_assert!(displaced.is_none());
349            prop_assert!(table.table_get(n).is_some());
350        }
351
352        #[test]
353        fn insert_success(entry_set in prop::collection::hash_set(0u8..255, 1..32)) {
354            init();
355
356            let entries: Vec<_> = entry_set.iter().copied().collect();
357            let mut displaced: Option<DummyEntry> = None;
358            let mut i = 0;
359
360            macro_rules! insert_step {
361                ($table:ident, $grown_table:ident, $grown_size:expr) => {
362                    while displaced.is_none() && i < entries.len() {
363                        displaced = $table.table_insert(DummyEntry::new(entries[i]));
364                        if(displaced.is_none()) {
365                            for j in 0..=i {
366                                prop_assert!($table.table_get(entries[j]).is_some(),
367                                "Missing value {} after insert", entries[j]);
368                            }
369                        }
370                        i += 1;
371                    }
372
373                    if displaced.is_none() {return Ok(())};
374
375                    let mut $grown_table: [Option<DummyEntry>; $grown_size] = [None; $grown_size];
376                    $table.table_grow(&mut $grown_table);
377                    displaced = $grown_table.table_insert(displaced.unwrap());
378
379                    if displaced.is_none() {
380                        for j in 0..i {
381                            prop_assert!(
382                                $grown_table.table_get(entries[j]).is_some(),
383                                "Missing value {} after growth with hash {:?}",
384                                entries[j],
385                                unsafe { RANDOM_PERMUTATION_HASH }
386                            );
387                        }
388                    }
389                };
390            }
391
392            let mut table2: [Option<DummyEntry>; 2] = [None, None];
393            insert_step!(table2, table4, 4);
394            insert_step!(table4, table8, 8);
395            insert_step!(table8, table16, 16);
396            insert_step!(table16, table32, 32);
397            insert_step!(table32, table64, 64);
398            insert_step!(table64, table128, 128);
399            insert_step!(table128, table256, 256);
400
401            prop_assert!(displaced.is_none());
402        }
403    }
404
405    #[test]
406    fn sequential_insert_all_keys() {
407        init();
408        let mut table: [Option<DummyEntry>; 256] = [None; 256];
409        for n in 0u8..=255 {
410            assert!(table.table_insert(DummyEntry::new(n)).is_none());
411        }
412    }
413}