mycroft_support/index/hash.rs
1//! `hash` provides a `HashIndex` over `storage` types - it only allows you to check presence or
2//! not, and if present, gets you the key.
3use std::hash::{BuildHasher, Hash, Hasher};
4use std::ops::Index;
5use std::marker::PhantomData;
6use std::collections::hash_map::RandomState;
7
// Initial number of buckets in the hash table. The table only ever grows (by doubling in
// `resize`), so this is also the minimum capacity.
const DEFAULT_SIZE: usize = 1024;
10
/// `CheckIndex` is a way to avoid needing to move or clone values from storage when checking for
/// equality.
///
/// Its primary purpose is to allow different storage mediums to be used as the `HashIndex`'s backing.
pub trait CheckIndex<T: ?Sized> {
    /// `x.check_index(key, val)` is equivalent to `&x[key] == val`, were such an indexing
    /// relationship to be definable.
    /// It's used when actually retrieving a value involves construction, such as when using
    /// columnar or disk based storage.
    /// Returns `true` iff the value stored under `key` equals `val`.
    fn check_index(&self, key: usize, val: &T) -> bool;
}
22
23/// If `Index` can be implemented for a type, provide the definitional interpretation from
24/// `CheckIndex`'s documentation.
25impl<T: PartialEq, D: Index<usize, Output = T>> CheckIndex<T> for D {
26 fn check_index(&self, key: usize, val: &T) -> bool {
27 &self[key] == val
28 }
29}
30
31// An individual bucket in the hash table. It needs both the hash and the key:
32// * In the case of a lookup, the key is the value we return
33// * To protect against hash collision, we need to check the store using the key when we think
34// we've found what we're looking for
35// * If we store only the key, we will need to repeatedly re-hash potentially large values, and
36// incur potentially expensive access times if disk is in use.
37// It keeps the whole hash because:
38// * It avoids recomputing the hash on resize
39// * Variable size buckets are a pain, and due to alignment may not even buy us anything
40#[derive(Clone, Copy, Eq, PartialEq)]
41struct Entry {
42 key: usize,
43 // Technically, hashes are u64s, but:
44 // 1.) usize will be <= u64 in size
45 // 2.) If usize < u64, then we never need the upper bits of the hash anyways because we can't
46 // have that many buckets
47 hash: usize,
48}
49
50// An empty entry constructed via a max key and max hash. Regardless of whether you're on a 64-bit
51// or 32-bit machine, `usize::MAX` is more of anything than you can put in memory, so this
52// shouldn't ever collide with a real entry.
53const EMPTY_ENTRY: Entry = Entry {
54 key: ::std::usize::MAX,
55 hash: ::std::usize::MAX,
56};
57
58impl Entry {
59 // Convenience function in case the representation of emptiness changes
60 fn is_empty(&self) -> bool {
61 self == &EMPTY_ENTRY
62 }
63}
64
/// A `HashIndex` is an externally-referencing `HashSet`. This means it does not store its own
/// entries, or even pointers to its entries, but rather an abstract key used to reference those
/// entries which can be used with a provided store at operation time.
///
/// This enables a `HashIndex` to be used over data structures which may not have the entry fully
/// realized, or which do not have the structure in memory at all (e.g. column stores, lazy stores,
/// disk stores).
///
/// Due to my specific use case, it differs from a default `HashSet` in a few ways in the
/// interface:
///
/// * On membership, the `HashIndex` will return the key for use in deduping situations.
/// * There is no delete (thus why there are no tombstones or fancy shifting)
/// * `insert()` may not be used for membership testing - you must test via `find()`, then
///   `insert()` only if not present.
pub struct HashIndex<T: ?Sized> {
    // Count of live entries; drives the load-factor check before inserts.
    elems: usize,
    // Random SipHash state. A value must be hashed through this exact state for lookups to
    // agree with inserts, which is why hashing goes through the index's own methods.
    hash_key: RandomState,
    // Open-addressed bucket array; vacant slots hold `EMPTY_ENTRY`.
    table: Vec<Entry>,
    // No `T` is ever stored here - only hashes and external keys - so ownership of `T` is
    // purely notional.
    marker: PhantomData<T>,
}
85
86impl<T: Hash + PartialEq + ?Sized> HashIndex<T> {
87 /// Creates a fresh `HashIndex`
88 pub fn new() -> Self {
89 Self {
90 elems: 0,
91 hash_key: RandomState::new(),
92 table: vec![EMPTY_ENTRY; DEFAULT_SIZE],
93 marker: PhantomData,
94 }
95 }
96
97 // Computes the probe length of a given index, e.g. on a lookup, how many steps would be
98 // required to find it.
99 fn probe_len(&self, index: usize) -> usize {
100 debug_assert!(self.table[index] != EMPTY_ENTRY);
101 let len = self.table.len();
102 // TODO can probably make this expression better with wrapping arith
103 (((self.table[index].hash % len) + len) - index) % len
104 }
105
106 // Performs a robin-hood insertion of the entry. This means it will walk the table until:
107 // 1.) A free space is found, which it will place the entry in
108 // 2.) An entry with a shorter probe length than how far we've gone is found, in which case we
109 // will steal its spot (from the rich) and give it to our current entry (the poor). In this
110 // case, it will take the now displaced entry, and continue the process looking for a spot
111 // for it.
112 // Why this is fast is beyond the scope of these comments, but it is, go read the paper.
113 fn robin_hood(&mut self, mut entry: Entry) {
114 let len = self.table.len();
115 let mut index = entry.hash % len;
116 let mut probe_len = 0;
117 loop {
118 // If we're pointed at an empty entry, insert
119 if self.table[index].is_empty() {
120 self.table[index] = entry;
121 return;
122 }
123 // If our probe len is greater than the probe len of the entry
124 // we're pointed at, swap and continue
125 let their_probe = self.probe_len(index);
126 if probe_len > their_probe {
127 ::std::mem::swap(&mut entry, &mut self.table[index]);
128 probe_len = their_probe;
129 }
130 // Step the index and check the next entry
131 index = (index + 1) % len;
132 probe_len += 1;
133
134 // Just in case our resize logic is wrong, check if we've manage to loop around to the
135 // optimal - this would indicate a full hash table, and the hash table is supposed to
136 // prevent itself from becoming overfull by resizing on insert if necessary.
137 debug_assert!({
138 let optimal_index = entry.hash % len;
139 index != optimal_index
140 })
141 }
142 }
143
144 // Checks whether adding one more element would bring us over our max load
145 // Currently max load is 0.9
146 fn needs_resize(&self) -> bool {
147 (self.elems + 1) * 10 > self.table.len() * 9
148 }
149
150 // Doubles table size
151 fn resize(&mut self) {
152 let mut table = vec![EMPTY_ENTRY; self.table.len() * 2];
153 ::std::mem::swap(&mut table, &mut self.table);
154 for entry in table.into_iter() {
155 if !entry.is_empty() {
156 self.robin_hood(entry)
157 }
158 }
159 }
160
161 /// Inserts the given key from the database, assuming that it is not already present.
162 /// if you provide a value which would already return a key via `find()`, behavior
163 /// is undefined.
164 pub fn insert(&mut self, key: usize, val: &T) {
165 if self.needs_resize() {
166 self.resize()
167 }
168 let mut hasher = self.hash_key.build_hasher();
169 val.hash(&mut hasher);
170 let hash = hasher.finish();
171 let entry = Entry {
172 key: key,
173 // See type decl for downcast justification
174 hash: hash as usize,
175 };
176 self.robin_hood(entry);
177 self.elems += 1;
178 }
179
180 /// Checks whether the `HashIndex` contains a provided value.
181 /// If it does, the result will be `Some(key)`, where `&db[key] == val`.
182 /// If it does not, the result will be `None`.
183 pub fn find<D: CheckIndex<T>>(&self, val: &T, db: &D) -> Option<usize> {
184 let mut hasher = self.hash_key.build_hasher();
185 val.hash(&mut hasher);
186 let hash = hasher.finish() as usize;
187 let len = self.table.len();
188 let mut index = hash % len;
189 let mut probe_len = 0;
190 loop {
191 if self.table[index].is_empty() {
192 return None;
193 } else if self.table[index].hash == hash {
194 if db.check_index(self.table[index].key, val) {
195 return Some(self.table[index].key);
196 }
197 } else if probe_len > self.probe_len(index) {
198 // Would have swapped and inserted here, early termination
199 return None;
200 }
201 index = (index + 1) % len;
202 probe_len += 1;
203 // Table load should prevent this from being possible, but like in insert, let's double
204 // check
205 debug_assert!({
206 let optimal_index = hash % len;
207 index != optimal_index
208 });
209 }
210 }
211}