summavy_stacker/
arena_hashmap.rs

1use std::{iter, mem, slice};
2
3use byteorder::{ByteOrder, NativeEndian};
4use murmurhash32::murmurhash2;
5
6use super::{Addr, MemoryArena};
7use crate::memory_arena::store;
8use crate::UnorderedId;
9
10/// Returns the actual memory size in bytes
11/// required to create a table with a given capacity.
12/// required to create a table of size
13pub fn compute_table_size(capacity: usize) -> usize {
14    capacity * mem::size_of::<KeyValue>()
15}
16
17/// `KeyValue` is the item stored in the hash table.
18/// The key is actually a `BytesRef` object stored in an external memory arena.
19/// The `value_addr` also points to an address in the memory arena.
20#[derive(Copy, Clone)]
21struct KeyValue {
22    key_value_addr: Addr,
23    hash: u32,
24    unordered_id: UnorderedId,
25}
26
27impl Default for KeyValue {
28    fn default() -> Self {
29        KeyValue {
30            key_value_addr: Addr::null_pointer(),
31            hash: 0u32,
32            unordered_id: UnorderedId::default(),
33        }
34    }
35}
36
37impl KeyValue {
38    fn is_empty(self) -> bool {
39        self.key_value_addr.is_null()
40    }
41}
42
43/// Customized `HashMap` with `&[u8]` keys
44///
45/// Its main particularity is that rather than storing its
46/// keys in the heap, keys are stored in a memory arena
47/// inline with the values.
48///
49/// The quirky API has the benefit of avoiding
50/// the computation of the hash of the key twice,
51/// or copying the key as long as there is no insert.
52pub struct ArenaHashMap {
53    table: Box<[KeyValue]>,
54    memory_arena: MemoryArena,
55    mask: usize,
56    occupied: Vec<usize>,
57    len: usize,
58}
59
60struct QuadraticProbing {
61    hash: usize,
62    i: usize,
63    mask: usize,
64}
65
66impl QuadraticProbing {
67    #[inline]
68    fn compute(hash: usize, mask: usize) -> QuadraticProbing {
69        QuadraticProbing { hash, i: 0, mask }
70    }
71
72    #[inline]
73    fn next_probe(&mut self) -> usize {
74        self.i += 1;
75        (self.hash + self.i) & self.mask
76    }
77}
78
79pub struct Iter<'a> {
80    hashmap: &'a ArenaHashMap,
81    inner: slice::Iter<'a, usize>,
82}
83
84impl<'a> Iterator for Iter<'a> {
85    type Item = (&'a [u8], Addr, UnorderedId);
86
87    fn next(&mut self) -> Option<Self::Item> {
88        self.inner.next().cloned().map(move |bucket: usize| {
89            let kv = self.hashmap.table[bucket];
90            let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
91            (key, offset, kv.unordered_id)
92        })
93    }
94}
95
96/// Returns the greatest power of two lower or equal to `n`.
97/// Except if n == 0, in that case, return 1.
98///
99/// # Panics if n == 0
100fn compute_previous_power_of_two(n: usize) -> usize {
101    assert!(n > 0);
102    let msb = (63u32 - (n as u64).leading_zeros()) as u8;
103    1 << msb
104}
105
106impl ArenaHashMap {
107    pub fn new(table_size: usize) -> ArenaHashMap {
108        assert!(table_size > 0);
109        let table_size_power_of_2 = compute_previous_power_of_two(table_size);
110        let memory_arena = MemoryArena::default();
111        let table: Vec<KeyValue> = iter::repeat(KeyValue::default())
112            .take(table_size_power_of_2)
113            .collect();
114        ArenaHashMap {
115            table: table.into_boxed_slice(),
116            memory_arena,
117            mask: table_size_power_of_2 - 1,
118            occupied: Vec::with_capacity(table_size_power_of_2 / 2),
119            len: 0,
120        }
121    }
122
123    #[inline]
124    pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
125        self.memory_arena.read(addr)
126    }
127
128    #[inline]
129    fn probe(&self, hash: u32) -> QuadraticProbing {
130        QuadraticProbing::compute(hash as usize, self.mask)
131    }
132
133    #[inline]
134    pub fn mem_usage(&self) -> usize {
135        self.table.len() * mem::size_of::<KeyValue>()
136    }
137
138    #[inline]
139    fn is_saturated(&self) -> bool {
140        self.table.len() < self.occupied.len() * 3
141    }
142
143    #[inline]
144    fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
145        let data = self.memory_arena.slice_from(addr);
146        let key_bytes_len = NativeEndian::read_u16(data) as usize;
147        let key_bytes: &[u8] = &data[2..][..key_bytes_len];
148        (key_bytes, addr.offset(2u32 + key_bytes_len as u32))
149    }
150
151    #[inline]
152    fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
153        let (stored_key, value_addr) = self.get_key_value(addr);
154        if stored_key == target_key {
155            Some(value_addr)
156        } else {
157            None
158        }
159    }
160
161    #[inline]
162    fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) -> UnorderedId {
163        self.occupied.push(bucket);
164        let unordered_id = self.len as UnorderedId;
165        self.len += 1;
166        self.table[bucket] = KeyValue {
167            key_value_addr,
168            hash,
169            unordered_id,
170        };
171        unordered_id
172    }
173
174    #[inline]
175    pub fn is_empty(&self) -> bool {
176        self.len() == 0
177    }
178
179    #[inline]
180    pub fn len(&self) -> usize {
181        self.len
182    }
183
184    #[inline]
185    pub fn iter(&self) -> Iter<'_> {
186        Iter {
187            inner: self.occupied.iter(),
188            hashmap: self,
189        }
190    }
191
192    fn resize(&mut self) {
193        let new_len = self.table.len() * 2;
194        let mask = new_len - 1;
195        self.mask = mask;
196        let new_table = vec![KeyValue::default(); new_len].into_boxed_slice();
197        let old_table = mem::replace(&mut self.table, new_table);
198        for old_pos in self.occupied.iter_mut() {
199            let key_value: KeyValue = old_table[*old_pos];
200            let mut probe = QuadraticProbing::compute(key_value.hash as usize, mask);
201            loop {
202                let bucket = probe.next_probe();
203                if self.table[bucket].is_empty() {
204                    *old_pos = bucket;
205                    self.table[bucket] = key_value;
206                    break;
207                }
208            }
209        }
210    }
211
212    /// `update` create a new entry for a given key if it does not exist
213    /// or updates the existing entry.
214    ///
215    /// The actual logic for this update is define in the `updater`
216    /// argument.
217    ///
218    /// If the key is not present, `updater` will receive `None` and
219    /// will be in charge of returning a default value.
220    /// If the key already as an associated value, then it will be passed
221    /// `Some(previous_value)`.
222    pub fn mutate_or_create<V, TMutator>(
223        &mut self,
224        key: &[u8],
225        mut updater: TMutator,
226    ) -> UnorderedId
227    where
228        V: Copy + 'static,
229        TMutator: FnMut(Option<V>) -> V,
230    {
231        if self.is_saturated() {
232            self.resize();
233        }
234        let hash = murmurhash2(key);
235        let mut probe = self.probe(hash);
236        loop {
237            let bucket = probe.next_probe();
238            let kv: KeyValue = self.table[bucket];
239            if kv.is_empty() {
240                // The key does not exist yet.
241                let val = updater(None);
242                let num_bytes = std::mem::size_of::<u16>() + key.len() + std::mem::size_of::<V>();
243                let key_addr = self.memory_arena.allocate_space(num_bytes);
244                {
245                    let data = self.memory_arena.slice_mut(key_addr, num_bytes);
246                    NativeEndian::write_u16(data, key.len() as u16);
247                    let stop = 2 + key.len();
248                    data[2..stop].copy_from_slice(key);
249                    store(&mut data[stop..], val);
250                }
251                return self.set_bucket(hash, key_addr, bucket);
252            } else if kv.hash == hash {
253                if let Some(val_addr) = self.get_value_addr_if_key_match(key, kv.key_value_addr) {
254                    let v = self.memory_arena.read(val_addr);
255                    let new_v = updater(Some(v));
256                    self.memory_arena.write_at(val_addr, new_v);
257                    return kv.unordered_id;
258                }
259            }
260        }
261    }
262}
263
264#[cfg(test)]
265mod tests {
266
267    use std::collections::HashMap;
268
269    use super::{compute_previous_power_of_two, ArenaHashMap};
270
271    #[test]
272    fn test_hash_map() {
273        let mut hash_map: ArenaHashMap = ArenaHashMap::new(1 << 18);
274        hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
275            assert_eq!(opt_val, None);
276            3u32
277        });
278        hash_map.mutate_or_create(b"abcd", |opt_val: Option<u32>| {
279            assert_eq!(opt_val, None);
280            4u32
281        });
282        hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
283            assert_eq!(opt_val, Some(3u32));
284            5u32
285        });
286        let mut vanilla_hash_map = HashMap::new();
287        let iter_values = hash_map.iter();
288        for (key, addr, _) in iter_values {
289            let val: u32 = hash_map.memory_arena.read(addr);
290            vanilla_hash_map.insert(key.to_owned(), val);
291        }
292        assert_eq!(vanilla_hash_map.len(), 2);
293    }
294
295    #[test]
296    fn test_compute_previous_power_of_two() {
297        assert_eq!(compute_previous_power_of_two(8), 8);
298        assert_eq!(compute_previous_power_of_two(9), 8);
299        assert_eq!(compute_previous_power_of_two(7), 4);
300        assert_eq!(compute_previous_power_of_two(u64::MAX as usize), 1 << 63);
301    }
302}