boomphf_patched/
hashmap.rs

1//! HashMap data structures, using MPHFs to encode the position of each key in a dense array.
2
3#[cfg(feature = "serde")]
4use serde::{self, Deserialize, Serialize};
5
6use crate::Mphf;
7use std::borrow::Borrow;
8use std::fmt::Debug;
9use std::hash::Hash;
10use std::iter::ExactSizeIterator;
11
12/// A HashMap data structure where the mapping between keys and values is encoded in a Mphf. This lets us store the keys and values in dense
13/// arrays, with ~3 bits/item overhead in the Mphf.
14#[derive(Debug, Clone)]
15#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
16pub struct BoomHashMap<K: Hash, D> {
17    mphf: Mphf<K>,
18    pub(crate) keys: Vec<K>,
19    pub(crate) values: Vec<D>,
20}
21
22impl<K, D> BoomHashMap<K, D>
23where
24    K: Hash + Debug + PartialEq,
25    D: Debug,
26{
27    fn create_map(mut keys: Vec<K>, mut values: Vec<D>, mphf: Mphf<K>) -> BoomHashMap<K, D> {
28        // reorder the keys and values according to the Mphf
29        for i in 0..keys.len() {
30            loop {
31                let kmer_slot = mphf.hash(&keys[i]) as usize;
32                if i == kmer_slot {
33                    break;
34                }
35                keys.swap(i, kmer_slot);
36                values.swap(i, kmer_slot);
37            }
38        }
39        BoomHashMap { mphf, keys, values }
40    }
41
42    /// Create a new hash map from the parallel array `keys` and `values`
43    pub fn new(keys: Vec<K>, data: Vec<D>) -> BoomHashMap<K, D> {
44        let mphf = Mphf::new(1.7, &keys);
45        Self::create_map(keys, data, mphf)
46    }
47
48    /// Get the value associated with `key`, if available, otherwise return None
49    pub fn get<Q: ?Sized>(&self, kmer: &Q) -> Option<&D>
50    where
51        K: Borrow<Q>,
52        Q: Hash + Eq,
53    {
54        let maybe_pos = self.mphf.try_hash(kmer);
55        match maybe_pos {
56            Some(pos) => {
57                let hashed_kmer = &self.keys[pos as usize];
58                if kmer == hashed_kmer.borrow() {
59                    Some(&self.values[pos as usize])
60                } else {
61                    None
62                }
63            }
64            None => None,
65        }
66    }
67
68    /// Get the position in the Mphf of a key, if the key exists.
69    pub fn get_key_id<Q: ?Sized>(&self, kmer: &Q) -> Option<usize>
70    where
71        K: Borrow<Q>,
72        Q: Hash + Eq,
73    {
74        let maybe_pos = self.mphf.try_hash(kmer);
75        match maybe_pos {
76            Some(pos) => {
77                let hashed_kmer = &self.keys[pos as usize];
78                if kmer == hashed_kmer.borrow() {
79                    Some(pos as usize)
80                } else {
81                    None
82                }
83            }
84            None => None,
85        }
86    }
87
88    /// Total number of key/value pairs
89    pub fn len(&self) -> usize {
90        self.keys.len()
91    }
92
93    pub fn is_empty(&self) -> bool {
94        self.keys.is_empty()
95    }
96
97    pub fn get_key(&self, id: usize) -> Option<&K> {
98        let max_key_id = self.len();
99        if id > max_key_id {
100            None
101        } else {
102            Some(&self.keys[id])
103        }
104    }
105
106    pub fn iter(&self) -> BoomIterator<K, D> {
107        BoomIterator {
108            hash: self,
109            index: 0,
110        }
111    }
112}
113
114impl<K, D> core::iter::FromIterator<(K, D)> for BoomHashMap<K, D>
115where
116    K: Hash + Debug + PartialEq,
117    D: Debug,
118{
119    fn from_iter<I: IntoIterator<Item = (K, D)>>(iter: I) -> Self {
120        let mut keys = Vec::new();
121        let mut values = Vec::new();
122
123        for (k, v) in iter {
124            keys.push(k);
125            values.push(v);
126        }
127        Self::new(keys, values)
128    }
129}
130
131impl<K, D> BoomHashMap<K, D>
132where
133    K: Hash + Debug + PartialEq + Send + Sync,
134    D: Debug,
135{
136    /// Create a new hash map from the parallel array `keys` and `values`, using a parallelized method to construct the Mphf.
137    pub fn new_parallel(keys: Vec<K>, data: Vec<D>) -> BoomHashMap<K, D> {
138        let mphf = Mphf::new_parallel(1.7, &keys, None);
139        Self::create_map(keys, data, mphf)
140    }
141}
142
143/// Iterate over key-value pairs in a BoomHashMap
144pub struct BoomIterator<'a, K: Hash + 'a, D: 'a> {
145    hash: &'a BoomHashMap<K, D>,
146    index: usize,
147}
148
149impl<'a, K: Hash, D> Iterator for BoomIterator<'a, K, D> {
150    type Item = (&'a K, &'a D);
151
152    fn next(&mut self) -> Option<Self::Item> {
153        if self.index == self.hash.keys.len() {
154            return None;
155        }
156
157        let elements = Some((&self.hash.keys[self.index], &self.hash.values[self.index]));
158        self.index += 1;
159
160        elements
161    }
162
163    fn size_hint(&self) -> (usize, Option<usize>) {
164        let remaining = self.hash.keys.len() - self.index;
165        (remaining, Some(remaining))
166    }
167}
168
169impl<'a, K: Hash, D1> ExactSizeIterator for BoomIterator<'a, K, D1> {}
170
171impl<'a, K: Hash, D> IntoIterator for &'a BoomHashMap<K, D> {
172    type Item = (&'a K, &'a D);
173    type IntoIter = BoomIterator<'a, K, D>;
174
175    fn into_iter(self) -> BoomIterator<'a, K, D> {
176        BoomIterator {
177            hash: self,
178            index: 0,
179        }
180    }
181}
182
183/// A HashMap data structure where the mapping between keys and 2 values is encoded in a Mphf. You should usually use `BoomHashMap` with a tuple/struct value type.
184/// If the layout overhead of the struct / tuple must be avoided, this variant of is an alternative.
185/// This lets us store the keys and values in dense
186/// arrays, with ~3 bits/item overhead in the Mphf.
187#[derive(Debug, Clone)]
188#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
189pub struct BoomHashMap2<K: Hash, D1, D2> {
190    mphf: Mphf<K>,
191    keys: Vec<K>,
192    values: Vec<D1>,
193    aux_values: Vec<D2>,
194}
195
196pub struct Boom2Iterator<'a, K: Hash + 'a, D1: 'a, D2: 'a> {
197    hash: &'a BoomHashMap2<K, D1, D2>,
198    index: usize,
199}
200
201impl<'a, K: Hash, D1, D2> Iterator for Boom2Iterator<'a, K, D1, D2> {
202    type Item = (&'a K, &'a D1, &'a D2);
203
204    fn next(&mut self) -> Option<Self::Item> {
205        if self.index == self.hash.keys.len() {
206            return None;
207        }
208
209        let elements = Some((
210            &self.hash.keys[self.index],
211            &self.hash.values[self.index],
212            &self.hash.aux_values[self.index],
213        ));
214        self.index += 1;
215        elements
216    }
217
218    fn size_hint(&self) -> (usize, Option<usize>) {
219        let remaining = self.hash.keys.len() - self.index;
220        (remaining, Some(remaining))
221    }
222}
223
224impl<'a, K: Hash, D1, D2> ExactSizeIterator for Boom2Iterator<'a, K, D1, D2> {}
225
226impl<'a, K: Hash, D1, D2> IntoIterator for &'a BoomHashMap2<K, D1, D2> {
227    type Item = (&'a K, &'a D1, &'a D2);
228    type IntoIter = Boom2Iterator<'a, K, D1, D2>;
229
230    fn into_iter(self) -> Boom2Iterator<'a, K, D1, D2> {
231        Boom2Iterator {
232            hash: self,
233            index: 0,
234        }
235    }
236}
237
238impl<K, D1, D2> BoomHashMap2<K, D1, D2>
239where
240    K: Hash + Debug + PartialEq,
241    D1: Debug,
242    D2: Debug,
243{
244    fn create_map(
245        mut keys: Vec<K>,
246        mut values: Vec<D1>,
247        mut aux_values: Vec<D2>,
248        mphf: Mphf<K>,
249    ) -> BoomHashMap2<K, D1, D2> {
250        // reorder the keys and values according to the Mphf
251        for i in 0..keys.len() {
252            loop {
253                let kmer_slot = mphf.hash(&keys[i]) as usize;
254                if i == kmer_slot {
255                    break;
256                }
257                keys.swap(i, kmer_slot);
258                values.swap(i, kmer_slot);
259                aux_values.swap(i, kmer_slot);
260            }
261        }
262
263        BoomHashMap2 {
264            mphf,
265            keys,
266            values,
267            aux_values,
268        }
269    }
270
271    /// Create a new hash map from the parallel arrays `keys` and `values`, and `aux_values`
272    pub fn new(keys: Vec<K>, values: Vec<D1>, aux_values: Vec<D2>) -> BoomHashMap2<K, D1, D2> {
273        let mphf = Mphf::new(1.7, &keys);
274        Self::create_map(keys, values, aux_values, mphf)
275    }
276
277    pub fn get<Q: ?Sized>(&self, kmer: &Q) -> Option<(&D1, &D2)>
278    where
279        K: Borrow<Q>,
280        Q: Hash + Eq,
281    {
282        let maybe_pos = self.mphf.try_hash(kmer);
283        match maybe_pos {
284            Some(pos) => {
285                let hashed_kmer = &self.keys[pos as usize];
286                if kmer == hashed_kmer.borrow() {
287                    Some((&self.values[pos as usize], &self.aux_values[pos as usize]))
288                } else {
289                    None
290                }
291            }
292            None => None,
293        }
294    }
295
296    pub fn get_key_id<Q: ?Sized>(&self, kmer: &Q) -> Option<usize>
297    where
298        K: Borrow<Q>,
299        Q: Hash + Eq,
300    {
301        let maybe_pos = self.mphf.try_hash(kmer);
302        match maybe_pos {
303            Some(pos) => {
304                let hashed_kmer = &self.keys[pos as usize];
305                if kmer == hashed_kmer.borrow() {
306                    Some(pos as usize)
307                } else {
308                    None
309                }
310            }
311            None => None,
312        }
313    }
314
315    pub fn len(&self) -> usize {
316        self.keys.len()
317    }
318
319    pub fn is_empty(&self) -> bool {
320        self.keys.is_empty()
321    }
322
323    // Return iterator over key-values pairs
324    pub fn iter(&self) -> Boom2Iterator<K, D1, D2> {
325        Boom2Iterator {
326            hash: self,
327            index: 0,
328        }
329    }
330
331    pub fn get_key(&self, id: usize) -> Option<&K> {
332        let max_key_id = self.len();
333        if id > max_key_id {
334            None
335        } else {
336            Some(&self.keys[id])
337        }
338    }
339}
340
341impl<K, D1, D2> core::iter::FromIterator<(K, D1, D2)> for BoomHashMap2<K, D1, D2>
342where
343    K: Hash + Debug + PartialEq,
344    D1: Debug,
345    D2: Debug,
346{
347    fn from_iter<I: IntoIterator<Item = (K, D1, D2)>>(iter: I) -> Self {
348        let mut keys = Vec::new();
349        let mut values1 = Vec::new();
350        let mut values2 = Vec::new();
351
352        for (k, v1, v2) in iter {
353            keys.push(k);
354            values1.push(v1);
355            values2.push(v2);
356        }
357        Self::new(keys, values1, values2)
358    }
359}
360
361impl<K, D1, D2> BoomHashMap2<K, D1, D2>
362where
363    K: Hash + Debug + PartialEq + Send + Sync,
364    D1: Debug,
365    D2: Debug,
366{
367    /// Create a new hash map from the parallel arrays `keys` and `values`, and `aux_values`, using a parallel algorithm to construct the Mphf.
368    pub fn new_parallel(keys: Vec<K>, data: Vec<D1>, aux_data: Vec<D2>) -> BoomHashMap2<K, D1, D2> {
369        let mphf = Mphf::new_parallel(1.7, &keys, None);
370        Self::create_map(keys, data, aux_data, mphf)
371    }
372}
373
374/// A HashMap data structure where the mapping between keys and values is encoded in a Mphf. *Keys are not stored* - this can greatly improve the memory consumption,
375/// but can only be used if you can guarantee that you will only query for keys that were in the original set.  Querying for a new key will return a random value, silently.
376#[derive(Debug, Clone)]
377#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
378pub struct NoKeyBoomHashMap<K, D1> {
379    pub mphf: Mphf<K>,
380    pub values: Vec<D1>,
381}
382
383impl<K, D1> core::iter::FromIterator<(K, D1)> for NoKeyBoomHashMap<K, D1>
384where
385    K: Hash + Debug + PartialEq + Send + Sync,
386    D1: Debug,
387{
388    fn from_iter<I: IntoIterator<Item = (K, D1)>>(iter: I) -> Self {
389        let mut keys = Vec::new();
390        let mut values1 = Vec::new();
391
392        for (k, v1) in iter {
393            keys.push(k);
394            values1.push(v1);
395        }
396        Self::new_parallel(keys, values1)
397    }
398}
399
400impl<K, D1> NoKeyBoomHashMap<K, D1>
401where
402    K: Hash + Debug + PartialEq + Send + Sync,
403    D1: Debug,
404{
405    pub fn new_parallel(mut keys: Vec<K>, mut values: Vec<D1>) -> NoKeyBoomHashMap<K, D1> {
406        let mphf = Mphf::new_parallel(1.7, &keys, None);
407        for i in 0..keys.len() {
408            loop {
409                let kmer_slot = mphf.hash(&keys[i]) as usize;
410                if i == kmer_slot {
411                    break;
412                }
413                keys.swap(i, kmer_slot);
414                values.swap(i, kmer_slot);
415            }
416        }
417
418        NoKeyBoomHashMap { mphf, values }
419    }
420
421    pub fn new_with_mphf(mphf: Mphf<K>, values: Vec<D1>) -> NoKeyBoomHashMap<K, D1> {
422        NoKeyBoomHashMap { mphf, values }
423    }
424
425    /// Get the value associated with `key`, if available, otherwise return None
426    pub fn get<Q: ?Sized>(&self, kmer: &Q) -> Option<&D1>
427    where
428        K: Borrow<Q>,
429        Q: Hash + Eq,
430    {
431        let maybe_pos = self.mphf.try_hash(kmer);
432        match maybe_pos {
433            Some(pos) => Some(&self.values[pos as usize]),
434            _ => None,
435        }
436    }
437}
438
439/// A HashMap data structure where the mapping between keys and values is encoded in a Mphf. *Keys are not stored* - this can greatly improve the memory consumption,
440/// but can only be used if you can guarantee that you will only query for keys that were in the original set.  Querying for a new key will return a random value, silently.
441#[derive(Debug, Clone)]
442#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
443pub struct NoKeyBoomHashMap2<K, D1, D2> {
444    pub mphf: Mphf<K>,
445    pub values: Vec<D1>,
446    pub aux_values: Vec<D2>,
447}
448
449impl<K, D1, D2> core::iter::FromIterator<(K, D1, D2)> for NoKeyBoomHashMap2<K, D1, D2>
450where
451    K: Hash + Debug + PartialEq + Send + Sync,
452    D1: Debug,
453    D2: Debug,
454{
455    fn from_iter<I: IntoIterator<Item = (K, D1, D2)>>(iter: I) -> Self {
456        let mut keys = Vec::new();
457        let mut values1 = Vec::new();
458        let mut values2 = Vec::new();
459
460        for (k, v1, v2) in iter {
461            keys.push(k);
462            values1.push(v1);
463            values2.push(v2);
464        }
465        Self::new_parallel(keys, values1, values2)
466    }
467}
468
469impl<K, D1, D2> NoKeyBoomHashMap2<K, D1, D2>
470where
471    K: Hash + Debug + PartialEq + Send + Sync,
472    D1: Debug,
473    D2: Debug,
474{
475    pub fn new_parallel(
476        mut keys: Vec<K>,
477        mut values: Vec<D1>,
478        mut aux_values: Vec<D2>,
479    ) -> NoKeyBoomHashMap2<K, D1, D2> {
480        let mphf = Mphf::new_parallel(1.7, &keys, None);
481        for i in 0..keys.len() {
482            loop {
483                let kmer_slot = mphf.hash(&keys[i]) as usize;
484                if i == kmer_slot {
485                    break;
486                }
487                keys.swap(i, kmer_slot);
488                values.swap(i, kmer_slot);
489                aux_values.swap(i, kmer_slot);
490            }
491        }
492        NoKeyBoomHashMap2 {
493            mphf,
494            values,
495            aux_values,
496        }
497    }
498
499    pub fn new_with_mphf(
500        mphf: Mphf<K>,
501        values: Vec<D1>,
502        aux_values: Vec<D2>,
503    ) -> NoKeyBoomHashMap2<K, D1, D2> {
504        NoKeyBoomHashMap2 {
505            mphf,
506            values,
507            aux_values,
508        }
509    }
510
511    /// Get the value associated with `key`, if available, otherwise return None
512    pub fn get<Q: ?Sized>(&self, kmer: &Q) -> Option<(&D1, &D2)>
513    where
514        K: Borrow<Q>,
515        Q: Hash + Eq,
516    {
517        let maybe_pos = self.mphf.try_hash(kmer);
518        maybe_pos.map(|pos| (&self.values[pos as usize], &self.aux_values[pos as usize]))
519    }
520}