entropy_map/map_with_dict.rs
1//! A module providing `MapWithDict`, an immutable hash map implementation.
2//!
3//! `MapWithDict` is a hash map structure that optimizes for space by utilizing a minimal perfect
4//! hash function (MPHF) for indexing the map's keys. This enables efficient storage and retrieval,
5//! as it reduces the overall memory footprint by packing unique values into a dictionary. The MPHF
6//! provides direct access to the indices of keys, which correspond to their respective values in
7//! the values dictionary. Keys are stored to ensure that `get` operation will return `None` if key
8//! wasn't present in original set.
9
10use std::borrow::Borrow;
11use std::collections::HashMap;
12use std::hash::{Hash, Hasher};
13use std::mem::size_of_val;
14
15use num::{PrimInt, Unsigned};
16use wyhash::WyHash;
17
18use crate::mphf::{Mphf, MphfError, DEFAULT_GAMMA};
19
20/// An efficient, immutable hash map with values dictionary-packed for optimized space usage.
21#[derive(Default)]
22#[cfg_attr(feature = "rkyv_derive", derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize))]
23#[cfg_attr(feature = "rkyv_derive", archive_attr(derive(rkyv::CheckBytes)))]
24pub struct MapWithDict<K, V, const B: usize = 32, const S: usize = 8, ST = u8, H = WyHash>
25where
26 ST: PrimInt + Unsigned,
27 H: Hasher + Default,
28{
29 /// Minimally Perfect Hash Function for keys indices retrieval
30 mphf: Mphf<B, S, ST, H>,
31 /// Map keys
32 keys: Box<[K]>,
33 /// Points to the value index in the dictionary
34 values_index: Box<[usize]>,
35 /// Map unique values
36 values_dict: Box<[V]>,
37}
38
39impl<K, V, const B: usize, const S: usize, ST, H> MapWithDict<K, V, B, S, ST, H>
40where
41 K: Eq + Hash + Clone,
42 V: Eq + Clone + Hash,
43 ST: PrimInt + Unsigned,
44 H: Hasher + Default,
45{
46 /// Constructs a `MapWithDict` from an iterator of key-value pairs and MPHF function params.
47 pub fn from_iter_with_params<I>(iter: I, gamma: f32) -> Result<Self, MphfError>
48 where
49 I: IntoIterator<Item = (K, V)>,
50 {
51 let mut keys = vec![];
52 let mut values_index = vec![];
53 let mut values_dict = vec![];
54 let mut offsets_cache = HashMap::new();
55
56 for (k, v) in iter {
57 keys.push(k.clone());
58
59 if let Some(&offset) = offsets_cache.get(&v) {
60 // re-use dictionary offset if found in cache
61 values_index.push(offset);
62 } else {
63 // store current dictionary length as an offset in both index and cache
64 let offset = values_dict.len();
65 offsets_cache.insert(v.clone(), offset);
66 values_index.push(offset);
67 values_dict.push(v.clone());
68 }
69 }
70
71 let mphf = Mphf::from_slice(&keys, gamma)?;
72
73 // Re-order `keys` and `values_index` according to `mphf`
74 for i in 0..keys.len() {
75 loop {
76 let idx = mphf.get(&keys[i]).unwrap();
77 if idx == i {
78 break;
79 }
80 keys.swap(i, idx);
81 values_index.swap(i, idx);
82 }
83 }
84
85 Ok(MapWithDict {
86 mphf,
87 keys: keys.into_boxed_slice(),
88 values_index: values_index.into_boxed_slice(),
89 values_dict: values_dict.into_boxed_slice(),
90 })
91 }
92
93 /// Returns a reference to the value corresponding to the key. Returns `None` if the key is
94 /// not present in the map.
95 ///
96 /// # Examples
97 /// ```
98 /// # use std::collections::HashMap;
99 /// # use entropy_map::MapWithDict;
100 /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
101 /// assert_eq!(map.get(&1), Some(&2));
102 /// assert_eq!(map.get(&5), None);
103 /// ```
104 #[inline]
105 pub fn get<Q>(&self, key: &Q) -> Option<&V>
106 where
107 K: Borrow<Q> + PartialEq<Q>,
108 Q: Hash + Eq + ?Sized,
109 {
110 let idx = self.mphf.get(key)?;
111
112 // SAFETY: `idx` is always within bounds (ensured during construction)
113 unsafe {
114 if self.keys.get_unchecked(idx) == key {
115 // SAFETY: `idx` and `value_idx` are always within bounds (ensure during construction)
116 let value_idx = *self.values_index.get_unchecked(idx);
117 Some(self.values_dict.get_unchecked(value_idx))
118 } else {
119 None
120 }
121 }
122 }
123
124 /// Returns the number of key-value pairs in the map.
125 ///
126 /// # Examples
127 /// ```
128 /// # use std::collections::HashMap;
129 /// # use entropy_map::MapWithDict;
130 /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
131 /// assert_eq!(map.len(), 2);
132 /// ```
133 #[inline]
134 pub fn len(&self) -> usize {
135 self.keys.len()
136 }
137
138 /// Returns `true` if the map contains no elements.
139 ///
140 /// # Examples
141 /// ```
142 /// # use std::collections::HashMap;
143 /// # use entropy_map::MapWithDict;
144 /// let map = MapWithDict::try_from(HashMap::from([(0, 0); 0])).unwrap();
145 /// assert_eq!(map.is_empty(), true);
146 /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
147 /// assert_eq!(map.is_empty(), false);
148 /// ```
149 #[inline]
150 pub fn is_empty(&self) -> bool {
151 self.keys.is_empty()
152 }
153
154 /// Checks if the map contains the specified key.
155 ///
156 /// # Examples
157 /// ```
158 /// # use std::collections::HashMap;
159 /// # use entropy_map::MapWithDict;
160 /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
161 /// assert_eq!(map.contains_key(&1), true);
162 /// assert_eq!(map.contains_key(&2), false);
163 /// ```
164 #[inline]
165 pub fn contains_key<Q>(&self, key: &Q) -> bool
166 where
167 K: Borrow<Q> + PartialEq<Q>,
168 Q: Hash + Eq + ?Sized,
169 {
170 if let Some(idx) = self.mphf.get(key) {
171 // SAFETY: `idx` is always within bounds (ensured during construction)
172 unsafe { self.keys.get_unchecked(idx) == key }
173 } else {
174 false
175 }
176 }
177
178 /// Returns an iterator over the map, yielding key-value pairs.
179 ///
180 /// # Examples
181 /// ```
182 /// # use std::collections::HashMap;
183 /// # use entropy_map::MapWithDict;
184 /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
185 /// for (key, val) in map.iter() {
186 /// println!("key: {key} val: {val}");
187 /// }
188 /// ```
189 #[inline]
190 pub fn iter(&self) -> impl Iterator<Item = (&K, &V)> {
191 self.keys
192 .iter()
193 .zip(self.values_index.iter())
194 .map(move |(key, &value_idx)| {
195 // SAFETY: `value_idx` is always within bounds (ensured during construction)
196 let value = unsafe { self.values_dict.get_unchecked(value_idx) };
197 (key, value)
198 })
199 }
200
201 /// Returns an iterator over the keys of the map.
202 ///
203 /// # Examples
204 /// ```
205 /// # use std::collections::HashMap;
206 /// # use entropy_map::MapWithDict;
207 /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
208 /// for key in map.keys() {
209 /// println!("{key}");
210 /// }
211 /// ```
212 #[inline]
213 pub fn keys(&self) -> impl Iterator<Item = &K> {
214 self.keys.iter()
215 }
216
217 /// Returns an iterator over the values of the map.
218 ///
219 /// # Examples
220 /// ```
221 /// # use std::collections::HashMap;
222 /// # use entropy_map::MapWithDict;
223 /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
224 /// for val in map.values() {
225 /// println!("{val}");
226 /// }
227 /// ```
228 #[inline]
229 pub fn values(&self) -> impl Iterator<Item = &V> {
230 self.values_index.iter().map(move |&value_idx| {
231 // SAFETY: `value_idx` is always within bounds (ensured during construction)
232 unsafe { self.values_dict.get_unchecked(value_idx) }
233 })
234 }
235
236 /// Returns the total number of bytes occupied by the structure.
237 ///
238 /// # Examples
239 /// ```
240 /// # use std::collections::HashMap;
241 /// # use entropy_map::MapWithDict;
242 /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
243 /// assert_eq!(map.size(), 270);
244 /// ```
245 #[inline]
246 pub fn size(&self) -> usize {
247 size_of_val(self)
248 + self.mphf.size()
249 + size_of_val(self.keys.as_ref())
250 + size_of_val(self.values_index.as_ref())
251 + size_of_val(self.values_dict.as_ref())
252 }
253}
254
255/// Creates a `MapWithDict` from a `HashMap`.
256impl<K, V> TryFrom<HashMap<K, V>> for MapWithDict<K, V>
257where
258 K: Eq + Hash + Clone,
259 V: Eq + Clone + Hash,
260{
261 type Error = MphfError;
262
263 #[inline]
264 fn try_from(value: HashMap<K, V>) -> Result<Self, Self::Error> {
265 MapWithDict::<K, V>::from_iter_with_params(value, DEFAULT_GAMMA)
266 }
267}
268
269/// Implement `get` for `Archived` version of `MapWithDict` if feature is enabled
270#[cfg(feature = "rkyv_derive")]
271impl<K, V, const B: usize, const S: usize, ST, H> ArchivedMapWithDict<K, V, B, S, ST, H>
272where
273 K: PartialEq + Hash + rkyv::Archive,
274 K::Archived: PartialEq<K>,
275 V: rkyv::Archive,
276 ST: PrimInt + Unsigned + rkyv::Archive<Archived = ST>,
277 H: Hasher + Default,
278{
279 /// Checks if the map contains the specified key.
280 ///
281 /// # Examples
282 /// ```
283 /// # use std::collections::HashMap;
284 /// # use entropy_map::MapWithDict;
285 /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
286 /// let archived_map = rkyv::from_bytes::<MapWithDict<u32, u32>>(
287 /// &rkyv::to_bytes::<_, 1024>(&map).unwrap()
288 /// ).unwrap();
289 /// assert_eq!(archived_map.contains_key(&1), true);
290 /// assert_eq!(archived_map.contains_key(&2), false);
291 /// ```
292 #[inline]
293 pub fn contains_key<Q: ?Sized>(&self, key: &Q) -> bool
294 where
295 K: Borrow<Q>,
296 <K as rkyv::Archive>::Archived: PartialEq<Q>,
297 Q: Hash + Eq,
298 {
299 if let Some(idx) = self.mphf.get(key) {
300 // SAFETY: `idx` is always within bounds (ensured during construction)
301 unsafe { self.keys.get_unchecked(idx) == key }
302 } else {
303 false
304 }
305 }
306
307 /// Returns a reference to the value corresponding to the key. Returns `None` if the key is
308 /// not present in the map.
309 ///
310 /// # Examples
311 /// ```
312 /// # use std::collections::HashMap;
313 /// # use entropy_map::MapWithDict;
314 /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap();
315 /// let archived_map = rkyv::from_bytes::<MapWithDict<u32, u32>>(
316 /// &rkyv::to_bytes::<_, 1024>(&map).unwrap()
317 /// ).unwrap();
318 /// assert_eq!(archived_map.get(&1), Some(&2));
319 /// assert_eq!(archived_map.get(&5), None);
320 /// ```
321 #[inline]
322 pub fn get<Q: ?Sized>(&self, key: &Q) -> Option<&V::Archived>
323 where
324 K: Borrow<Q>,
325 <K as rkyv::Archive>::Archived: PartialEq<Q>,
326 Q: Hash + Eq,
327 {
328 let idx = self.mphf.get(key)?;
329
330 // SAFETY: `idx` is always within bounds (ensured during construction)
331 unsafe {
332 if self.keys.get_unchecked(idx) == key {
333 // SAFETY: `idx` and `value_idx` are always within bounds (ensure during construction)
334 let value_idx = *self.values_index.get_unchecked(idx) as usize;
335 Some(self.values_dict.get_unchecked(value_idx))
336 } else {
337 None
338 }
339 }
340 }
341
342 /// Returns an iterator over the archived map, yielding archived key-value pairs.
343 #[inline]
344 pub fn iter(&self) -> impl Iterator<Item = (&K::Archived, &V::Archived)> {
345 self.keys
346 .iter()
347 .zip(self.values_index.iter())
348 .map(move |(key, &value_idx)| {
349 // SAFETY: `value_idx` is always within bounds (ensured during construction)
350 let value = unsafe { self.values_dict.get_unchecked(value_idx as usize) };
351 (key, value)
352 })
353 }
354}
355
356#[cfg(test)]
357mod tests {
358 use super::*;
359 use paste::paste;
360 use proptest::prelude::*;
361 use rand::{Rng, SeedableRng};
362 use rand_chacha::ChaCha8Rng;
363 use std::collections::{hash_map::RandomState, HashSet};
364
365 fn gen_map(items_num: usize) -> HashMap<u64, u32> {
366 let mut rng = ChaCha8Rng::seed_from_u64(123);
367
368 (0..items_num)
369 .map(|_| {
370 let key = rng.gen::<u64>();
371 let value = rng.gen_range(1..=10);
372 (key, value)
373 })
374 .collect()
375 }
376
377 #[test]
378 fn test_map_with_dict() {
379 // Collect original key-value pairs directly into a HashMap
380 let original_map = gen_map(1000);
381
382 // Create the map from the iterator
383 let map = MapWithDict::try_from(original_map.clone()).unwrap();
384
385 // Test len
386 assert_eq!(map.len(), original_map.len());
387
388 // Test is_empty
389 assert_eq!(map.is_empty(), original_map.is_empty());
390
391 // Test get, contains_key
392 for (key, value) in &original_map {
393 assert_eq!(map.get(key), Some(value));
394 assert!(map.contains_key(key));
395 }
396
397 // Test iter
398 for (&k, &v) in map.iter() {
399 assert_eq!(original_map.get(&k), Some(&v));
400 }
401
402 // Test keys
403 for k in map.keys() {
404 assert!(original_map.contains_key(k));
405 }
406
407 // Test values
408 for &v in map.values() {
409 assert!(original_map.values().any(|&val| val == v));
410 }
411
412 // Test size
413 assert_eq!(map.size(), 16626);
414 }
415
416 /// Assert that we can call `.get()` with `K::borrow()`.
417 #[test]
418 fn test_get_borrow() {
419 let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]);
420 let map = MapWithDict::try_from(original_map).unwrap();
421
422 assert_eq!(map.get("a"), Some(&()));
423 assert!(map.contains_key("a"));
424 assert_eq!(map.get("b"), Some(&()));
425 assert!(map.contains_key("b"));
426 assert_eq!(map.get("c"), None);
427 assert!(!map.contains_key("c"));
428 }
429
430 #[cfg(feature = "rkyv_derive")]
431 #[test]
432 fn test_rkyv() {
433 // create regular `HashMap`, then `MapWithDict`, then serialize to `rkyv` bytes.
434 let original_map = gen_map(1000);
435 let map = MapWithDict::try_from(original_map.clone()).unwrap();
436 let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&map).unwrap();
437
438 assert_eq!(rkyv_bytes.len(), 12464);
439
440 let rkyv_map = rkyv::check_archived_root::<MapWithDict<u64, u32>>(&rkyv_bytes).unwrap();
441
442 // Test get on `Archived` version
443 for (k, v) in original_map.iter() {
444 assert_eq!(v, rkyv_map.get(k).unwrap());
445 }
446
447 // Test iter on `Archived` version
448 for (&k, &v) in rkyv_map.iter() {
449 assert_eq!(original_map.get(&k), Some(&v));
450 }
451 }
452
453 #[cfg(feature = "rkyv_derive")]
454 #[test]
455 fn test_rkyv_get_borrow() {
456 let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]);
457 let map = MapWithDict::try_from(original_map).unwrap();
458 let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&map).unwrap();
459 let rkyv_map = rkyv::check_archived_root::<MapWithDict<String, ()>>(&rkyv_bytes).unwrap();
460
461 assert_eq!(map.get("a"), Some(&()));
462 assert!(rkyv_map.contains_key("a"));
463 assert_eq!(map.get("b"), Some(&()));
464 assert!(rkyv_map.contains_key("b"));
465 assert_eq!(map.get("c"), None);
466 assert!(!rkyv_map.contains_key("c"));
467 }
468
469 macro_rules! proptest_map_with_dict_model {
470 ($(($b:expr, $s:expr, $gamma:expr)),* $(,)?) => {
471 $(
472 paste! {
473 proptest! {
474 #[test]
475 fn [<proptest_map_with_dict_model_ $b _ $s _ $gamma>](model: HashMap<u64, u64>, arbitrary: HashSet<u64>) {
476 let entropy_map: MapWithDict<u64, u64, $b, $s> = MapWithDict::from_iter_with_params(
477 model.clone(),
478 $gamma as f32 / 100.0
479 ).unwrap();
480
481 // Assert that length matches model.
482 assert_eq!(entropy_map.len(), model.len());
483 assert_eq!(entropy_map.is_empty(), model.is_empty());
484
485 // Assert that keys and values match model.
486 assert_eq!(
487 HashSet::<_, RandomState>::from_iter(entropy_map.keys()),
488 HashSet::from_iter(model.keys())
489 );
490 assert_eq!(
491 HashSet::<_, RandomState>::from_iter(entropy_map.values()),
492 HashSet::from_iter(model.values())
493 );
494
495 // Assert that contains and get operations match model for contained elements.
496 for (k, v) in &model {
497 assert!(entropy_map.contains_key(&k));
498 assert_eq!(entropy_map.get(&k), Some(v));
499 }
500
501 // Assert that contains and get operations match model for random elements.
502 for k in arbitrary {
503 assert_eq!(
504 model.contains_key(&k),
505 entropy_map.contains_key(&k),
506 );
507 assert_eq!(entropy_map.get(&k), model.get(&k));
508 }
509 }
510 }
511 }
512 )*
513 };
514 }
515
516 proptest_map_with_dict_model!(
517 // (1, 8, 100),
518 (2, 8, 100),
519 (4, 8, 100),
520 (7, 8, 100),
521 (8, 8, 100),
522 (15, 8, 100),
523 (16, 8, 100),
524 (23, 8, 100),
525 (24, 8, 100),
526 (31, 8, 100),
527 (32, 8, 100),
528 (33, 8, 100),
529 (48, 8, 100),
530 (53, 8, 100),
531 (61, 8, 100),
532 (63, 8, 100),
533 (64, 8, 100),
534 (32, 7, 100),
535 (32, 5, 100),
536 (32, 4, 100),
537 (32, 3, 100),
538 (32, 1, 100),
539 (32, 0, 100),
540 (32, 8, 200),
541 (32, 6, 200),
542 );
543}