entropy_map/
map_with_dict.rs

1//! A module providing `MapWithDict`, an immutable hash map implementation.
2//!
3//! `MapWithDict` is a hash map structure that optimizes for space by utilizing a minimal perfect
4//! hash function (MPHF) for indexing the map's keys. This enables efficient storage and retrieval,
5//! as it reduces the overall memory footprint by packing unique values into a dictionary. The MPHF
6//! provides direct access to the indices of keys, which correspond to their respective values in
7//! the values dictionary. Keys are stored to ensure that `get` operation will return `None` if key
8//! wasn't present in original set.
9
10use std::borrow::Borrow;
11use std::collections::HashMap;
12use std::hash::{Hash, Hasher};
13use std::mem::size_of_val;
14
15use num::{PrimInt, Unsigned};
16use wyhash::WyHash;
17
18use crate::mphf::{Mphf, MphfError, DEFAULT_GAMMA};
19
20/// An efficient, immutable hash map with values dictionary-packed for optimized space usage.
21#[derive(Default)]
22#[cfg_attr(feature = "rkyv_derive", derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize))]
23#[cfg_attr(feature = "rkyv_derive", archive_attr(derive(rkyv::CheckBytes)))]
24pub struct MapWithDict<K, V, const B: usize = 32, const S: usize = 8, ST = u8, H = WyHash>
25where
26    ST: PrimInt + Unsigned,
27    H: Hasher + Default,
28{
29    /// Minimally Perfect Hash Function for keys indices retrieval
30    mphf: Mphf<B, S, ST, H>,
31    /// Map keys
32    keys: Box<[K]>,
33    /// Points to the value index in the dictionary
34    values_index: Box<[usize]>,
35    /// Map unique values
36    values_dict: Box<[V]>,
37}
38
39impl<K, V, const B: usize, const S: usize, ST, H> MapWithDict<K, V, B, S, ST, H>
40where
41    K: Eq + Hash + Clone,
42    V: Eq + Clone + Hash,
43    ST: PrimInt + Unsigned,
44    H: Hasher + Default,
45{
46    /// Constructs a `MapWithDict` from an iterator of key-value pairs and MPHF function params.
47    pub fn from_iter_with_params<I>(iter: I, gamma: f32) -> Result<Self, MphfError>
48    where
49        I: IntoIterator<Item = (K, V)>,
50    {
51        let mut keys = vec![];
52        let mut values_index = vec![];
53        let mut values_dict = vec![];
54        let mut offsets_cache = HashMap::new();
55
56        for (k, v) in iter {
57            keys.push(k.clone());
58
59            if let Some(&offset) = offsets_cache.get(&v) {
60                // re-use dictionary offset if found in cache
61                values_index.push(offset);
62            } else {
63                // store current dictionary length as an offset in both index and cache
64                let offset = values_dict.len();
65                offsets_cache.insert(v.clone(), offset);
66                values_index.push(offset);
67                values_dict.push(v.clone());
68            }
69        }
70
71        let mphf = Mphf::from_slice(&keys, gamma)?;
72
73        // Re-order `keys` and `values_index` according to `mphf`
74        for i in 0..keys.len() {
75            loop {
76                let idx = mphf.get(&keys[i]).unwrap();
77                if idx == i {
78                    break;
79                }
80                keys.swap(i, idx);
81                values_index.swap(i, idx);
82            }
83        }
84
85        Ok(MapWithDict {
86            mphf,
87            keys: keys.into_boxed_slice(),
88            values_index: values_index.into_boxed_slice(),
89            values_dict: values_dict.into_boxed_slice(),
90        })
91    }
92
93    /// Returns a reference to the value corresponding to the key. Returns `None` if the key is
94    /// not present in the map.
95    ///
96    /// # Examples
97    /// ```
98    /// # use std::collections::HashMap;
99    /// # use entropy_map::MapWithDict;
100    /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
101    /// assert_eq!(map.get(&1), Some(&2));
102    /// assert_eq!(map.get(&5), None);
103    /// ```
104    #[inline]
105    pub fn get<Q>(&self, key: &Q) -> Option<&V>
106    where
107        K: Borrow<Q> + PartialEq<Q>,
108        Q: Hash + Eq + ?Sized,
109    {
110        let idx = self.mphf.get(key)?;
111
112        // SAFETY: `idx` is always within bounds (ensured during construction)
113        unsafe {
114            if self.keys.get_unchecked(idx) == key {
115                // SAFETY: `idx` and `value_idx` are always within bounds (ensure during construction)
116                let value_idx = *self.values_index.get_unchecked(idx);
117                Some(self.values_dict.get_unchecked(value_idx))
118            } else {
119                None
120            }
121        }
122    }
123
124    /// Returns the number of key-value pairs in the map.
125    ///
126    /// # Examples
127    /// ```
128    /// # use std::collections::HashMap;
129    /// # use entropy_map::MapWithDict;
130    /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
131    /// assert_eq!(map.len(), 2);
132    /// ```
133    #[inline]
134    pub fn len(&self) -> usize {
135        self.keys.len()
136    }
137
138    /// Returns `true` if the map contains no elements.
139    ///
140    /// # Examples
141    /// ```
142    /// # use std::collections::HashMap;
143    /// # use entropy_map::MapWithDict;
144    /// let map = MapWithDict::try_from(HashMap::from([(0, 0); 0])).unwrap();
145    /// assert_eq!(map.is_empty(), true);
146    /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
147    /// assert_eq!(map.is_empty(), false);
148    /// ```
149    #[inline]
150    pub fn is_empty(&self) -> bool {
151        self.keys.is_empty()
152    }
153
154    /// Checks if the map contains the specified key.
155    ///
156    /// # Examples
157    /// ```
158    /// # use std::collections::HashMap;
159    /// # use entropy_map::MapWithDict;
160    /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
161    /// assert_eq!(map.contains_key(&1), true);
162    /// assert_eq!(map.contains_key(&2), false);
163    /// ```
164    #[inline]
165    pub fn contains_key<Q>(&self, key: &Q) -> bool
166    where
167        K: Borrow<Q> + PartialEq<Q>,
168        Q: Hash + Eq + ?Sized,
169    {
170        if let Some(idx) = self.mphf.get(key) {
171            // SAFETY: `idx` is always within bounds (ensured during construction)
172            unsafe { self.keys.get_unchecked(idx) == key }
173        } else {
174            false
175        }
176    }
177
178    /// Returns an iterator over the map, yielding key-value pairs.
179    ///
180    /// # Examples
181    /// ```
182    /// # use std::collections::HashMap;
183    /// # use entropy_map::MapWithDict;
184    /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
185    /// for (key, val) in map.iter() {
186    ///     println!("key: {key} val: {val}");
187    /// }
188    /// ```
189    #[inline]
190    pub fn iter(&self) -> impl Iterator<Item = (&K, &V)> {
191        self.keys
192            .iter()
193            .zip(self.values_index.iter())
194            .map(move |(key, &value_idx)| {
195                // SAFETY: `value_idx` is always within bounds (ensured during construction)
196                let value = unsafe { self.values_dict.get_unchecked(value_idx) };
197                (key, value)
198            })
199    }
200
201    /// Returns an iterator over the keys of the map.
202    ///
203    /// # Examples
204    /// ```
205    /// # use std::collections::HashMap;
206    /// # use entropy_map::MapWithDict;
207    /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
208    /// for key in map.keys() {
209    ///     println!("{key}");
210    /// }
211    /// ```
212    #[inline]
213    pub fn keys(&self) -> impl Iterator<Item = &K> {
214        self.keys.iter()
215    }
216
217    /// Returns an iterator over the values of the map.
218    ///
219    /// # Examples
220    /// ```
221    /// # use std::collections::HashMap;
222    /// # use entropy_map::MapWithDict;
223    /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
224    /// for val in map.values() {
225    ///     println!("{val}");
226    /// }
227    /// ```
228    #[inline]
229    pub fn values(&self) -> impl Iterator<Item = &V> {
230        self.values_index.iter().map(move |&value_idx| {
231            // SAFETY: `value_idx` is always within bounds (ensured during construction)
232            unsafe { self.values_dict.get_unchecked(value_idx) }
233        })
234    }
235
236    /// Returns the total number of bytes occupied by the structure.
237    ///
238    /// # Examples
239    /// ```
240    /// # use std::collections::HashMap;
241    /// # use entropy_map::MapWithDict;
242    /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
243    /// assert_eq!(map.size(), 270);
244    /// ```
245    #[inline]
246    pub fn size(&self) -> usize {
247        size_of_val(self)
248            + self.mphf.size()
249            + size_of_val(self.keys.as_ref())
250            + size_of_val(self.values_index.as_ref())
251            + size_of_val(self.values_dict.as_ref())
252    }
253}
254
255/// Creates a `MapWithDict` from a `HashMap`.
256impl<K, V> TryFrom<HashMap<K, V>> for MapWithDict<K, V>
257where
258    K: Eq + Hash + Clone,
259    V: Eq + Clone + Hash,
260{
261    type Error = MphfError;
262
263    #[inline]
264    fn try_from(value: HashMap<K, V>) -> Result<Self, Self::Error> {
265        MapWithDict::<K, V>::from_iter_with_params(value, DEFAULT_GAMMA)
266    }
267}
268
269/// Implement `get` for `Archived` version of `MapWithDict` if feature is enabled
270#[cfg(feature = "rkyv_derive")]
271impl<K, V, const B: usize, const S: usize, ST, H> ArchivedMapWithDict<K, V, B, S, ST, H>
272where
273    K: PartialEq + Hash + rkyv::Archive,
274    K::Archived: PartialEq<K>,
275    V: rkyv::Archive,
276    ST: PrimInt + Unsigned + rkyv::Archive<Archived = ST>,
277    H: Hasher + Default,
278{
279    /// Checks if the map contains the specified key.
280    ///
281    /// # Examples
282    /// ```
283    /// # use std::collections::HashMap;
284    /// # use entropy_map::MapWithDict;
285    /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
286    /// let archived_map = rkyv::from_bytes::<MapWithDict<u32, u32>>(
287    ///     &rkyv::to_bytes::<_, 1024>(&map).unwrap()
288    /// ).unwrap();
289    /// assert_eq!(archived_map.contains_key(&1), true);
290    /// assert_eq!(archived_map.contains_key(&2), false);
291    /// ```
292    #[inline]
293    pub fn contains_key<Q: ?Sized>(&self, key: &Q) -> bool
294    where
295        K: Borrow<Q>,
296        <K as rkyv::Archive>::Archived: PartialEq<Q>,
297        Q: Hash + Eq,
298    {
299        if let Some(idx) = self.mphf.get(key) {
300            // SAFETY: `idx` is always within bounds (ensured during construction)
301            unsafe { self.keys.get_unchecked(idx) == key }
302        } else {
303            false
304        }
305    }
306
307    /// Returns a reference to the value corresponding to the key. Returns `None` if the key is
308    /// not present in the map.
309    ///
310    /// # Examples
311    /// ```
312    /// # use std::collections::HashMap;
313    /// # use entropy_map::MapWithDict;
314    /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
315    /// let archived_map = rkyv::from_bytes::<MapWithDict<u32, u32>>(
316    ///     &rkyv::to_bytes::<_, 1024>(&map).unwrap()
317    /// ).unwrap();
318    /// assert_eq!(archived_map.get(&1), Some(&2));
319    /// assert_eq!(archived_map.get(&5), None);
320    /// ```
321    #[inline]
322    pub fn get<Q: ?Sized>(&self, key: &Q) -> Option<&V::Archived>
323    where
324        K: Borrow<Q>,
325        <K as rkyv::Archive>::Archived: PartialEq<Q>,
326        Q: Hash + Eq,
327    {
328        let idx = self.mphf.get(key)?;
329
330        // SAFETY: `idx` is always within bounds (ensured during construction)
331        unsafe {
332            if self.keys.get_unchecked(idx) == key {
333                // SAFETY: `idx` and `value_idx` are always within bounds (ensure during construction)
334                let value_idx = *self.values_index.get_unchecked(idx) as usize;
335                Some(self.values_dict.get_unchecked(value_idx))
336            } else {
337                None
338            }
339        }
340    }
341
342    /// Returns an iterator over the archived map, yielding archived key-value pairs.
343    #[inline]
344    pub fn iter(&self) -> impl Iterator<Item = (&K::Archived, &V::Archived)> {
345        self.keys
346            .iter()
347            .zip(self.values_index.iter())
348            .map(move |(key, &value_idx)| {
349                // SAFETY: `value_idx` is always within bounds (ensured during construction)
350                let value = unsafe { self.values_dict.get_unchecked(value_idx as usize) };
351                (key, value)
352            })
353    }
354}
355
356#[cfg(test)]
357mod tests {
358    use super::*;
359    use paste::paste;
360    use proptest::prelude::*;
361    use rand::{Rng, SeedableRng};
362    use rand_chacha::ChaCha8Rng;
363    use std::collections::{hash_map::RandomState, HashSet};
364
365    fn gen_map(items_num: usize) -> HashMap<u64, u32> {
366        let mut rng = ChaCha8Rng::seed_from_u64(123);
367
368        (0..items_num)
369            .map(|_| {
370                let key = rng.gen::<u64>();
371                let value = rng.gen_range(1..=10);
372                (key, value)
373            })
374            .collect()
375    }
376
377    #[test]
378    fn test_map_with_dict() {
379        // Collect original key-value pairs directly into a HashMap
380        let original_map = gen_map(1000);
381
382        // Create the map from the iterator
383        let map = MapWithDict::try_from(original_map.clone()).unwrap();
384
385        // Test len
386        assert_eq!(map.len(), original_map.len());
387
388        // Test is_empty
389        assert_eq!(map.is_empty(), original_map.is_empty());
390
391        // Test get, contains_key
392        for (key, value) in &original_map {
393            assert_eq!(map.get(key), Some(value));
394            assert!(map.contains_key(key));
395        }
396
397        // Test iter
398        for (&k, &v) in map.iter() {
399            assert_eq!(original_map.get(&k), Some(&v));
400        }
401
402        // Test keys
403        for k in map.keys() {
404            assert!(original_map.contains_key(k));
405        }
406
407        // Test values
408        for &v in map.values() {
409            assert!(original_map.values().any(|&val| val == v));
410        }
411
412        // Test size
413        assert_eq!(map.size(), 16626);
414    }
415
416    /// Assert that we can call `.get()` with `K::borrow()`.
417    #[test]
418    fn test_get_borrow() {
419        let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]);
420        let map = MapWithDict::try_from(original_map).unwrap();
421
422        assert_eq!(map.get("a"), Some(&()));
423        assert!(map.contains_key("a"));
424        assert_eq!(map.get("b"), Some(&()));
425        assert!(map.contains_key("b"));
426        assert_eq!(map.get("c"), None);
427        assert!(!map.contains_key("c"));
428    }
429
430    #[cfg(feature = "rkyv_derive")]
431    #[test]
432    fn test_rkyv() {
433        // create regular `HashMap`, then `MapWithDict`, then serialize to `rkyv` bytes.
434        let original_map = gen_map(1000);
435        let map = MapWithDict::try_from(original_map.clone()).unwrap();
436        let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&map).unwrap();
437
438        assert_eq!(rkyv_bytes.len(), 12464);
439
440        let rkyv_map = rkyv::check_archived_root::<MapWithDict<u64, u32>>(&rkyv_bytes).unwrap();
441
442        // Test get on `Archived` version
443        for (k, v) in original_map.iter() {
444            assert_eq!(v, rkyv_map.get(k).unwrap());
445        }
446
447        // Test iter on `Archived` version
448        for (&k, &v) in rkyv_map.iter() {
449            assert_eq!(original_map.get(&k), Some(&v));
450        }
451    }
452
453    #[cfg(feature = "rkyv_derive")]
454    #[test]
455    fn test_rkyv_get_borrow() {
456        let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]);
457        let map = MapWithDict::try_from(original_map).unwrap();
458        let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&map).unwrap();
459        let rkyv_map = rkyv::check_archived_root::<MapWithDict<String, ()>>(&rkyv_bytes).unwrap();
460
461        assert_eq!(map.get("a"), Some(&()));
462        assert!(rkyv_map.contains_key("a"));
463        assert_eq!(map.get("b"), Some(&()));
464        assert!(rkyv_map.contains_key("b"));
465        assert_eq!(map.get("c"), None);
466        assert!(!rkyv_map.contains_key("c"));
467    }
468
469    macro_rules! proptest_map_with_dict_model {
470        ($(($b:expr, $s:expr, $gamma:expr)),* $(,)?) => {
471            $(
472                paste! {
473                    proptest! {
474                        #[test]
475                        fn [<proptest_map_with_dict_model_ $b _ $s _ $gamma>](model: HashMap<u64, u64>, arbitrary: HashSet<u64>) {
476                            let entropy_map: MapWithDict<u64, u64, $b, $s> = MapWithDict::from_iter_with_params(
477                                model.clone(),
478                                $gamma as f32 / 100.0
479                            ).unwrap();
480
481                            // Assert that length matches model.
482                            assert_eq!(entropy_map.len(), model.len());
483                            assert_eq!(entropy_map.is_empty(), model.is_empty());
484
485                            // Assert that keys and values match model.
486                            assert_eq!(
487                                HashSet::<_, RandomState>::from_iter(entropy_map.keys()),
488                                HashSet::from_iter(model.keys())
489                            );
490                            assert_eq!(
491                                HashSet::<_, RandomState>::from_iter(entropy_map.values()),
492                                HashSet::from_iter(model.values())
493                            );
494
495                            // Assert that contains and get operations match model for contained elements.
496                            for (k, v) in &model {
497                                assert!(entropy_map.contains_key(&k));
498                                assert_eq!(entropy_map.get(&k), Some(v));
499                            }
500
501                            // Assert that contains and get operations match model for random elements.
502                            for k in arbitrary {
503                                assert_eq!(
504                                    model.contains_key(&k),
505                                    entropy_map.contains_key(&k),
506                                );
507                                assert_eq!(entropy_map.get(&k), model.get(&k));
508                            }
509                        }
510                    }
511                }
512            )*
513        };
514    }
515
516    proptest_map_with_dict_model!(
517        // (1, 8, 100),
518        (2, 8, 100),
519        (4, 8, 100),
520        (7, 8, 100),
521        (8, 8, 100),
522        (15, 8, 100),
523        (16, 8, 100),
524        (23, 8, 100),
525        (24, 8, 100),
526        (31, 8, 100),
527        (32, 8, 100),
528        (33, 8, 100),
529        (48, 8, 100),
530        (53, 8, 100),
531        (61, 8, 100),
532        (63, 8, 100),
533        (64, 8, 100),
534        (32, 7, 100),
535        (32, 5, 100),
536        (32, 4, 100),
537        (32, 3, 100),
538        (32, 1, 100),
539        (32, 0, 100),
540        (32, 8, 200),
541        (32, 6, 200),
542    );
543}