inohashmap/
lib.rs

1/*! Stores values for strings in a Hashmap in a fast and compact way.
2
3Good to count strings and assign ids to them or similar. Address space of string data is limited to u32::MAX (4GB).
4string data is size in bytes of all uniquely inserted strings + string length metadata per string.
5
6# Examples
7```
8use inohashmap::StringHashMap;
9let mut hashmap = StringHashMap::<u32>::new();
10let val = hashmap.get_or_create("blub1", 0);
11assert_eq!(*val, 0);
12*val += 1;
13
14let val = hashmap.get_or_create("blub2", 2);
15assert_eq!(*val, 2);
16
17```
18
19*/
20
21use crate::bytesref::BytesRef;
22use crate::hasher::fnv32a_yoshimitsu_hasher;
23use core::fmt::Debug;
24use vint32::{decode_varint_slice, encode_varint_into};
25mod bytesref;
26pub mod hasher;
27
28#[derive(Debug)]
29pub struct StringHashMap<T> {
30    /// contains string in compressed format
31    pub(crate) string_data: Vec<u8>,
32    /// pointer to string data and value
33    pub(crate) table: Vec<TableEntry<T>>,
34    bitshift: usize,
35    pub occupied: usize,
36    mask: u32,
37}
38
39impl<T: Default + Clone + Debug> Default for StringHashMap<T> {
40    fn default() -> Self {
41        StringHashMap::with_power_of_two_size(10)
42    }
43}
44
45#[derive(Debug, Clone, Copy, Default)]
46pub(crate) struct TableEntry<T> {
47    value: T,
48    pointer: BytesRef,
49}
50
51impl<T: Default + Clone + Debug> StringHashMap<T> {
52    #[inline]
53    pub fn with_power_of_two_size(power_of_two: usize) -> Self {
54        let shift = power_of_two - 1;
55        let mut table = vec![];
56        table.resize(1 << shift, TableEntry::default());
57        StringHashMap {
58            string_data: Vec::with_capacity((1 << shift) * 2),
59            mask: table.len() as u32 - 1,
60            table,
61            bitshift: 32 - power_of_two,
62            occupied: 0,
63        }
64    }
65    #[inline]
66    pub fn new() -> Self {
67        Self::with_power_of_two_size(10)
68    }
69
70    #[inline]
71    pub fn len(&self) -> usize {
72        self.occupied
73    }
74
75    #[inline]
76    pub fn is_empty(&self) -> bool {
77        self.occupied == 0
78    }
79
80    #[inline]
81    pub fn shrink_to_fit(&mut self) {
82        self.string_data.shrink_to_fit();
83        self.table.shrink_to_fit();
84    }
85
86    #[inline]
87    pub fn get(&mut self, el: &str) -> Option<&T> {
88        let mut probe = self.get_probe(el);
89        let mut hash = probe.next_probe() as usize;
90
91        loop {
92            let entry = self.get_entry(hash);
93            if entry.pointer.is_null() {
94                return None;
95            } else if self.read_string(entry.pointer) == el {
96                return Some(&self.get_entry(hash as usize).value);
97            }
98            hash = probe.next_probe() as usize;
99        }
100    }
101    #[inline]
102    pub fn get_mut(&mut self, el: &str) -> Option<&mut T> {
103        let mut probe = self.get_probe(el);
104        let mut hash = probe.next_probe() as usize;
105
106        loop {
107            let entry = self.get_entry(hash);
108            if entry.pointer.is_null() {
109                return None;
110            } else if self.read_string(entry.pointer) == el {
111                return Some(&mut self.get_entry_mut(hash as usize).value);
112            }
113            hash = probe.next_probe() as usize;
114        }
115    }
116
117    #[inline]
118    pub fn get_or_create(&mut self, el: &str, value: T) -> &mut T {
119        // check load factor, resize when 0.5
120        // if self.occupied as f32 * 1.5 > self.table.len() as f32 {
121        if self.occupied as f32 * 1.5 > self.table.len() as f32 {
122            self.resize();
123        }
124        let mut probe = self.get_probe(el);
125        let mut hash = probe.next_probe() as usize;
126
127        loop {
128            let entry = self.get_entry(hash);
129            if entry.pointer.is_null() {
130                self.occupied += 1;
131                let inserted_value = self.put_in_bucket(hash as usize, el, value);
132                return &mut inserted_value.value;
133            } else if self.read_string(entry.pointer) == el {
134                return &mut self.get_entry_mut(hash as usize).value;
135            }
136            hash = probe.next_probe() as usize;
137        }
138    }
139
140    #[inline]
141    fn get_probe(&self, el: &str) -> QuadraticProbing {
142        let hash = fnv32a_yoshimitsu_hasher(el.as_bytes());
143        let hash = hash >> self.bitshift;
144        QuadraticProbing::compute(hash, self.mask)
145    }
146
147    #[inline]
148    fn put_entry_resize(&mut self, el: &str, new_entry: TableEntry<T>) {
149        let mut probe = self.get_probe(el);
150        let mut hash = probe.next_probe();
151        loop {
152            let entry = self.get_entry_mut(hash as usize);
153            if entry.pointer.is_null() {
154                entry.pointer = new_entry.pointer;
155                entry.value = new_entry.value;
156                return;
157            }
158            hash = probe.next_probe();
159        }
160    }
161
162    #[inline]
163    pub fn values(&self) -> impl Iterator<Item = &T> {
164        self.table
165            .iter()
166            .filter(|entry| !entry.pointer.is_null())
167            .map(|entry| &entry.value)
168    }
169    #[inline]
170    pub fn values_mut(&mut self) -> impl Iterator<Item = &mut T> {
171        self.table
172            .iter_mut()
173            .filter(|entry| !entry.pointer.is_null())
174            .map(|entry| &mut entry.value)
175    }
176    #[inline]
177    pub fn keys(&self) -> KeyIterator<'_, T> {
178        KeyIterator { map: self, pos: 0 }
179    }
180
181    #[inline]
182    pub fn iter(&self) -> impl Iterator<Item = (&str, &T)> {
183        self.table
184            .iter()
185            .filter(|entry| !entry.pointer.is_null())
186            .map(move |entry| (self.read_string(entry.pointer), &entry.value))
187    }
188
189    #[inline]
190    pub fn iter_mut(&mut self) -> impl Iterator<Item = (&str, &mut T)> {
191        // You bested me borrow checker
192        // Cast should be fine, since self lives als long as the iter and all data accessed in read_string is immutable
193        // I don't know why but mutable access doesn't work here without errors
194        // Should be possible to fix by creating an extra Iter struct like in keys
195        let cheated_self = unsafe { &*(self as *mut StringHashMap<T> as *const StringHashMap<T>)};
196        self.table
197            .iter_mut()
198            .filter(|entry| !entry.pointer.is_null())
199            .map(move |entry| {
200                let text = cheated_self.read_string(entry.pointer);
201                (text, &mut entry.value)
202            })
203    }
204
205    #[inline]
206    fn get_entry(&self, hash: usize) -> &TableEntry<T> {
207        unsafe { self.table.get_unchecked(hash) }
208    }
209    #[inline]
210    fn get_entry_mut(&mut self, hash: usize) -> &mut TableEntry<T> {
211        unsafe { self.table.get_unchecked_mut(hash as usize) }
212    }
213
214    /// Doubles the size of the table
215    /// Creates a new table and moves all entries to the new table
216    #[cold]
217    fn resize(&mut self) {
218        let mut table: Vec<TableEntry<T>> = vec![];
219        table.resize(self.table.len() * 2, TableEntry::default());
220        self.mask = table.len() as u32 - 1;
221
222        std::mem::swap(&mut self.table, &mut table);
223        self.bitshift -= 1;
224        for entry in table.into_iter().filter(|x| !x.pointer.is_null()) {
225            let text = self.read_string(entry.pointer);
226            // casting away lifetime of text
227            // Since string_data will not be altered in put_entry_resize
228            let text = unsafe { std::mem::transmute::<&str, &'static str>(text) };
229            self.put_entry_resize(text, entry);
230        }
231    }
232
233    #[inline]
234    fn put_in_bucket(&mut self, hash: usize, el: &str, value: T) -> &mut TableEntry<T> {
235        let pos = BytesRef(self.string_data.len() as u32);
236
237        encode_varint_into(&mut self.string_data, el.len() as u32);
238
239        self.string_data.extend_from_slice(el.as_bytes());
240        // unsafe {
241        //     self.string_data.reserve(el.len());
242        //     let target = self.string_data.as_mut_ptr().add(self.string_data.len());
243        //     std::ptr::copy_nonoverlapping(el.as_bytes().as_ptr(), target, el.as_bytes().len());
244        //     self.string_data.set_len(self.string_data.len()+ el.len() );
245        // };
246
247        let entry = self.get_entry_mut(hash);
248        *entry = TableEntry {
249            value,
250            pointer: pos,
251        };
252        entry
253    }
254
255    #[inline]
256    pub(crate) fn read_string(&self, pos: BytesRef) -> &str {
257        let mut pos = pos.addr() as usize;
258        let length_string = decode_varint_slice(&self.string_data, &mut pos).unwrap();
259        unsafe {
260            std::str::from_utf8_unchecked(
261                &self
262                    .string_data
263                    .get_unchecked(pos..pos + length_string as usize),
264            )
265        }
266    }
267}
268
269#[derive(Debug)]
270pub struct KeyIterator<'a, T> {
271    pub map: &'a StringHashMap<T>,
272    pos: usize,
273}
274
275impl<'a, T> Iterator for KeyIterator<'a, T> {
276    type Item = &'a str;
277
278    #[inline]
279    fn next(&mut self) -> Option<&'a str> {
280        if self.pos == self.map.string_data.len() {
281            None
282        } else {
283            let length_string = decode_varint_slice(&self.map.string_data, &mut self.pos).unwrap();
284            let text = unsafe {
285                std::str::from_utf8_unchecked(
286                    &self
287                        .map
288                        .string_data
289                        .get_unchecked(self.pos..self.pos + length_string as usize),
290                )
291            };
292            self.pos += length_string as usize;
293            Some(text)
294        }
295    }
296}
297
298struct QuadraticProbing {
299    hash: u32,
300    i: u32,
301    mask: u32,
302}
303
304impl QuadraticProbing {
305    #[inline]
306    fn compute(hash: u32, mask: u32) -> QuadraticProbing {
307        QuadraticProbing { hash, i: 1, mask }
308    }
309
310    #[inline]
311    fn next_probe(&mut self) -> u32 {
312        self.i += 1;
313        ((self.hash + (self.i + self.i * self.i)) >> 1) & self.mask
314        // (self.hash + (self.i * self.i)) & self.mask
315    }
316}
317
318#[cfg(test)]
319mod tests {
320    use super::*;
321    #[test]
322    fn get_values_big() {
323        use std::io::Read;
324
325        let mut contents = String::new();
326        std::fs::File::open("1342-0.txt")
327            .unwrap()
328            .read_to_string(&mut contents)
329            .unwrap();
330
331        let mut map = StringHashMap::<u32>::new();
332        let mut counter = 0;
333        for text in contents.split_whitespace() {
334            let value = map.get_or_create(text, 0);
335            *value += 1;
336            counter += 1;
337        }
338
339        let sum: u32 = map.values().sum();
340        assert_eq!(sum, counter);
341        assert_eq!(map.string_data.len() < 1_000_000, true);
342
343        dbg!(counter);
344
345        // let num_one_time_probe= map.num_probes.iter().filter(|el| *el == &1).cloned().sum::<u32>();
346        // let num_two_time_probe= map.num_probes.iter().filter(|el| *el == &2).cloned().sum::<u32>();
347        // let num_more_than_one_time_probe= map.num_probes.iter().filter(|el| *el != &1).cloned().sum::<u32>();
348        // dbg!(num_one_time_probe);
349        // dbg!(num_two_time_probe);
350        // dbg!(num_more_than_one_time_probe);
351        // dbg!(map.existing);
352    }
353    #[test]
354    fn values() {
355        let mut hashmap = StringHashMap::<u32>::new();
356        hashmap.get_or_create("blub", 1);
357
358        let val: u32 = hashmap.values().sum();
359        assert_eq!(val, 1);
360    }
361    #[test]
362    fn simple() {
363        let mut hashmap = StringHashMap::<u32>::new();
364        let val = hashmap.get_or_create("blub1", 0);
365        assert_eq!(*val, 0);
366        *val += 1;
367
368        let val = hashmap.get_or_create("blub2", 2);
369        assert_eq!(*val, 2);
370    }
371    #[test]
372    fn get_or_create() {
373        let mut hashmap = StringHashMap::<u32>::new();
374        let val = hashmap.get_or_create("blub", 0);
375        assert_eq!(*val, 0);
376        *val += 1;
377
378        let val = hashmap.get_or_create("blub", 0);
379        assert_eq!(*val, 1);
380    }
381    #[test]
382    fn test_resize() {
383        let mut hashmap = StringHashMap::<u32>::with_power_of_two_size(1);
384        hashmap.get_or_create("blub1", 3);
385        hashmap.get_or_create("blub2", 4);
386
387        assert_eq!(hashmap.get_or_create("blub1", 3), &3);
388
389        //should resize
390        let val = hashmap.get_or_create("blub3", 5);
391        assert_eq!(*val, 5);
392
393        // // check values after resize
394        assert_eq!(hashmap.get_or_create("blub1", 0), &3);
395        assert_eq!(hashmap.get_or_create("blub2", 0), &4);
396        assert_eq!(hashmap.get_or_create("blub3", 0), &5);
397    }
398    #[test]
399    fn test_iter() {
400        let mut hashmap = StringHashMap::<u32>::with_power_of_two_size(1);
401        hashmap.get_or_create("blub1", 3);
402        hashmap.get_or_create("blub2", 4);
403        hashmap.get_or_create("blub3", 5);
404
405        assert_eq!(hashmap.get_or_create("blub1", 0), &3);
406        assert_eq!(hashmap.get_or_create("blub2", 0), &4);
407        assert_eq!(hashmap.get_or_create("blub3", 0), &5);
408
409        assert_eq!(
410            hashmap.keys().collect::<Vec<_>>(),
411            &["blub1", "blub2", "blub3"]
412        );
413        assert_eq!(hashmap.values().collect::<Vec<_>>(), &[&5, &4, &3]);
414        assert_eq!(hashmap.values_mut().collect::<Vec<_>>(), &[&5, &4, &3]);
415        assert_eq!(
416            hashmap.iter().collect::<Vec<_>>(),
417            &[("blub3", &5), ("blub2", &4), ("blub1", &3),]
418        );
419        assert_eq!(
420            hashmap.iter_mut().collect::<Vec<_>>(),
421            &[("blub3", &mut 5), ("blub2", &mut 4), ("blub1", &mut 3),]
422        );
423    }
424
425    #[test]
426    fn test_get() {
427        let mut hashmap = StringHashMap::<u32>::with_power_of_two_size(1);
428        hashmap.get_or_create("blub1", 3);
429        hashmap.get_or_create("blub2", 4);
430        hashmap.get_or_create("blub3", 5);
431
432        assert_eq!(hashmap.get_or_create("blub1", 0), &3);
433        assert_eq!(hashmap.get_or_create("blub2", 0), &4);
434        assert_eq!(hashmap.get_mut("blub3"), Some(&mut 5));
435        assert_eq!(hashmap.get("blub3"), Some(&5));
436        assert_eq!(hashmap.get("blub1000"), None);
437        assert_eq!(hashmap.get_mut("blub1000"), None);
438
439        hashmap.shrink_to_fit();
440    }
441    #[test]
442    fn test_len() {
443        let mut hashmap = StringHashMap::<u32>::with_power_of_two_size(1);
444        assert_eq!(hashmap.is_empty(), true);
445        hashmap.get_or_create("blub1", 3);
446        assert_eq!(hashmap.len(), 1);
447        assert_eq!(hashmap.is_empty(), false);
448        hashmap.get_or_create("blub2", 4);
449        hashmap.get_or_create("blub3", 5);
450        assert_eq!(hashmap.len(), 3);
451        assert_eq!(hashmap.is_empty(), false);
452    }
453}