mycroft_support/index/
hash.rs

1//! `hash` provides a `HashIndex` over `storage` types - it only allows you to check presence or
2//! not, and if present, gets you the key.
3use std::hash::{BuildHasher, Hash, Hasher};
4use std::ops::Index;
5use std::marker::PhantomData;
6use std::collections::hash_map::RandomState;
7
// Initial number of buckets in the hash table. It is not required to be a power of
// two, since bucket selection uses `%` rather than bit-masking.
const DEFAULT_SIZE: usize = 1024;
10
/// `CheckIndex` is a way to avoid needing to move or clone values from storage when checking for
/// equality.
///
/// Its primary purpose is to allow different storage mediums to be used as the `HashIndex`'s backing.
pub trait CheckIndex<T: ?Sized> {
    /// `x.check_index(key, val)` is equivalent to `&x[key] == val`, were such an indexing
    /// relationship to be definable.
    /// It's used when actually retrieving a value involves construction, such as when using
    /// columnar or disk based storage.
    fn check_index(&self, key: usize, val: &T) -> bool;
}
22
23/// If `Index` can be implemented for a type, provide the definitional interpretation from
24/// `CheckIndex`'s documentation.
25impl<T: PartialEq, D: Index<usize, Output = T>> CheckIndex<T> for D {
26    fn check_index(&self, key: usize, val: &T) -> bool {
27        &self[key] == val
28    }
29}
30
// An individual bucket in the hash table. It needs both the hash and the key:
// * In the case of a lookup, the key is the value we return
// * To protect against hash collision, we need to check the store using the key when we think
//   we've found what we're looking for
// * If we stored only the key, we would need to repeatedly re-hash potentially large values, and
//   incur potentially expensive access times if disk is in use.
// It keeps the whole hash because:
// * It avoids recomputing the hash on resize
// * Variable size buckets are a pain, and due to alignment may not even buy us anything
#[derive(Clone, Copy, Eq, PartialEq)]
struct Entry {
    // Key into the external backing store; this is what `find()` hands back.
    key: usize,
    // Technically, hashes are u64s, but:
    // 1.) usize will be <= u64 in size
    // 2.) If usize < u64, then we never need the upper bits of the hash anyways because we can't
    //   have that many buckets
    hash: usize,
}
49
// An empty entry constructed via a max key and max hash. Regardless of whether you're on a 64-bit
// or 32-bit machine, `usize::MAX` is more of anything than you can put in memory, so this
// shouldn't ever collide with a real entry. (A real *hash* may happen to equal `usize::MAX`,
// but a real key -- an index into an in-memory store -- cannot, so the pair stays unique.)
const EMPTY_ENTRY: Entry = Entry {
    key: ::std::usize::MAX,
    hash: ::std::usize::MAX,
};
57
58impl Entry {
59    // Convenience function in case the representation of emptiness changes
60    fn is_empty(&self) -> bool {
61        self == &EMPTY_ENTRY
62    }
63}
64
/// A `HashIndex` is an externally-referencing `HashSet`. This means it does not store its own
/// entries, or even pointers to its entries, but rather an abstract key used to reference those
/// entries which can be used with a provided store at operation time.
///
/// This enables a `HashIndex` to be used over data structures which may not have the entry fully
/// realized, or which do not have the structure in memory at all (e.g. column stores, lazy stores,
/// disk stores).
///
/// Due to my specific use case, it differs from a default `HashSet` in a few ways in the interface:
///
/// * On membership, the `HashIndex` will return the key for use in deduping situations.
/// * There is no delete (thus why there are no tombstones or fancy shifting)
/// * `insert()` may not be used for membership testing - you must test via `find()`, then
///   `insert()` only if not present.
pub struct HashIndex<T: ?Sized> {
    // Number of live entries; drives the load-factor check on insert.
    elems: usize,
    // Randomly seeded hasher state, so hash values differ between index instances.
    hash_key: RandomState,
    // Open-addressed bucket array; vacant buckets hold `EMPTY_ENTRY`.
    table: Vec<Entry>,
    // No `T` is ever stored; `PhantomData` records the logical element type.
    marker: PhantomData<T>,
}
85
86impl<T: Hash + PartialEq + ?Sized> HashIndex<T> {
87    /// Creates a fresh `HashIndex`
88    pub fn new() -> Self {
89        Self {
90            elems: 0,
91            hash_key: RandomState::new(),
92            table: vec![EMPTY_ENTRY; DEFAULT_SIZE],
93            marker: PhantomData,
94        }
95    }
96
97    // Computes the probe length of a given index, e.g. on a lookup, how many steps would be
98    // required to find it.
99    fn probe_len(&self, index: usize) -> usize {
100        debug_assert!(self.table[index] != EMPTY_ENTRY);
101        let len = self.table.len();
102        // TODO can probably make this expression better with wrapping arith
103        (((self.table[index].hash % len) + len) - index) % len
104    }
105
106    // Performs a robin-hood insertion of the entry. This means it will walk the table until:
107    // 1.) A free space is found, which it will place the entry in
108    // 2.) An entry with a shorter probe length than how far we've gone is found, in which case we
109    //   will steal its spot (from the rich) and give it to our current entry (the poor). In this
110    //   case, it will take the now displaced entry, and continue the process looking for a spot
111    //   for it.
112    // Why this is fast is beyond the scope of these comments, but it is, go read the paper.
113    fn robin_hood(&mut self, mut entry: Entry) {
114        let len = self.table.len();
115        let mut index = entry.hash % len;
116        let mut probe_len = 0;
117        loop {
118            // If we're pointed at an empty entry, insert
119            if self.table[index].is_empty() {
120                self.table[index] = entry;
121                return;
122            }
123            // If our probe len is greater than the probe len of the entry
124            // we're pointed at, swap and continue
125            let their_probe = self.probe_len(index);
126            if probe_len > their_probe {
127                ::std::mem::swap(&mut entry, &mut self.table[index]);
128                probe_len = their_probe;
129            }
130            // Step the index and check the next entry
131            index = (index + 1) % len;
132            probe_len += 1;
133
134            // Just in case our resize logic is wrong, check if we've manage to loop around to the
135            // optimal - this would indicate a full hash table, and the hash table is supposed to
136            // prevent itself from becoming overfull by resizing on insert if necessary.
137            debug_assert!({
138                let optimal_index = entry.hash % len;
139                index != optimal_index
140            })
141        }
142    }
143
144    // Checks whether adding one more element would bring us over our max load
145    // Currently max load is 0.9
146    fn needs_resize(&self) -> bool {
147        (self.elems + 1) * 10 > self.table.len() * 9
148    }
149
150    // Doubles table size
151    fn resize(&mut self) {
152        let mut table = vec![EMPTY_ENTRY; self.table.len() * 2];
153        ::std::mem::swap(&mut table, &mut self.table);
154        for entry in table.into_iter() {
155            if !entry.is_empty() {
156                self.robin_hood(entry)
157            }
158        }
159    }
160
161    /// Inserts the given key from the database, assuming that it is not already present.
162    /// if you provide a value which would already return a key via `find()`, behavior
163    /// is undefined.
164    pub fn insert(&mut self, key: usize, val: &T) {
165        if self.needs_resize() {
166            self.resize()
167        }
168        let mut hasher = self.hash_key.build_hasher();
169        val.hash(&mut hasher);
170        let hash = hasher.finish();
171        let entry = Entry {
172            key: key,
173            // See type decl for downcast justification
174            hash: hash as usize,
175        };
176        self.robin_hood(entry);
177        self.elems += 1;
178    }
179
180    /// Checks whether the `HashIndex` contains a provided value.
181    /// If it does, the result will be `Some(key)`, where `&db[key] == val`.
182    /// If it does not, the result will be `None`.
183    pub fn find<D: CheckIndex<T>>(&self, val: &T, db: &D) -> Option<usize> {
184        let mut hasher = self.hash_key.build_hasher();
185        val.hash(&mut hasher);
186        let hash = hasher.finish() as usize;
187        let len = self.table.len();
188        let mut index = hash % len;
189        let mut probe_len = 0;
190        loop {
191            if self.table[index].is_empty() {
192                return None;
193            } else if self.table[index].hash == hash {
194                if db.check_index(self.table[index].key, val) {
195                    return Some(self.table[index].key);
196                }
197            } else if probe_len > self.probe_len(index) {
198                // Would have swapped and inserted here, early termination
199                return None;
200            }
201            index = (index + 1) % len;
202            probe_len += 1;
203            // Table load should prevent this from being possible, but like in insert, let's double
204            // check
205            debug_assert!({
206                let optimal_index = hash % len;
207                index != optimal_index
208            });
209        }
210    }
211}