1use crate::bytesref::BytesRef;
22use crate::hasher::fnv32a_yoshimitsu_hasher;
23use core::fmt::Debug;
24use vint32::{decode_varint_slice, encode_varint_into};
25mod bytesref;
26pub mod hasher;
27
28#[derive(Debug)]
29pub struct StringHashMap<T> {
30 pub(crate) string_data: Vec<u8>,
32 pub(crate) table: Vec<TableEntry<T>>,
34 bitshift: usize,
35 pub occupied: usize,
36 mask: u32,
37}
38
39impl<T: Default + Clone + Debug> Default for StringHashMap<T> {
40 fn default() -> Self {
41 StringHashMap::with_power_of_two_size(10)
42 }
43}
44
45#[derive(Debug, Clone, Copy, Default)]
46pub(crate) struct TableEntry<T> {
47 value: T,
48 pointer: BytesRef,
49}
50
51impl<T: Default + Clone + Debug> StringHashMap<T> {
52 #[inline]
53 pub fn with_power_of_two_size(power_of_two: usize) -> Self {
54 let shift = power_of_two - 1;
55 let mut table = vec![];
56 table.resize(1 << shift, TableEntry::default());
57 StringHashMap {
58 string_data: Vec::with_capacity((1 << shift) * 2),
59 mask: table.len() as u32 - 1,
60 table,
61 bitshift: 32 - power_of_two,
62 occupied: 0,
63 }
64 }
65 #[inline]
66 pub fn new() -> Self {
67 Self::with_power_of_two_size(10)
68 }
69
70 #[inline]
71 pub fn len(&self) -> usize {
72 self.occupied
73 }
74
75 #[inline]
76 pub fn is_empty(&self) -> bool {
77 self.occupied == 0
78 }
79
80 #[inline]
81 pub fn shrink_to_fit(&mut self) {
82 self.string_data.shrink_to_fit();
83 self.table.shrink_to_fit();
84 }
85
86 #[inline]
87 pub fn get(&mut self, el: &str) -> Option<&T> {
88 let mut probe = self.get_probe(el);
89 let mut hash = probe.next_probe() as usize;
90
91 loop {
92 let entry = self.get_entry(hash);
93 if entry.pointer.is_null() {
94 return None;
95 } else if self.read_string(entry.pointer) == el {
96 return Some(&self.get_entry(hash as usize).value);
97 }
98 hash = probe.next_probe() as usize;
99 }
100 }
101 #[inline]
102 pub fn get_mut(&mut self, el: &str) -> Option<&mut T> {
103 let mut probe = self.get_probe(el);
104 let mut hash = probe.next_probe() as usize;
105
106 loop {
107 let entry = self.get_entry(hash);
108 if entry.pointer.is_null() {
109 return None;
110 } else if self.read_string(entry.pointer) == el {
111 return Some(&mut self.get_entry_mut(hash as usize).value);
112 }
113 hash = probe.next_probe() as usize;
114 }
115 }
116
117 #[inline]
118 pub fn get_or_create(&mut self, el: &str, value: T) -> &mut T {
119 if self.occupied as f32 * 1.5 > self.table.len() as f32 {
122 self.resize();
123 }
124 let mut probe = self.get_probe(el);
125 let mut hash = probe.next_probe() as usize;
126
127 loop {
128 let entry = self.get_entry(hash);
129 if entry.pointer.is_null() {
130 self.occupied += 1;
131 let inserted_value = self.put_in_bucket(hash as usize, el, value);
132 return &mut inserted_value.value;
133 } else if self.read_string(entry.pointer) == el {
134 return &mut self.get_entry_mut(hash as usize).value;
135 }
136 hash = probe.next_probe() as usize;
137 }
138 }
139
140 #[inline]
141 fn get_probe(&self, el: &str) -> QuadraticProbing {
142 let hash = fnv32a_yoshimitsu_hasher(el.as_bytes());
143 let hash = hash >> self.bitshift;
144 QuadraticProbing::compute(hash, self.mask)
145 }
146
147 #[inline]
148 fn put_entry_resize(&mut self, el: &str, new_entry: TableEntry<T>) {
149 let mut probe = self.get_probe(el);
150 let mut hash = probe.next_probe();
151 loop {
152 let entry = self.get_entry_mut(hash as usize);
153 if entry.pointer.is_null() {
154 entry.pointer = new_entry.pointer;
155 entry.value = new_entry.value;
156 return;
157 }
158 hash = probe.next_probe();
159 }
160 }
161
162 #[inline]
163 pub fn values(&self) -> impl Iterator<Item = &T> {
164 self.table
165 .iter()
166 .filter(|entry| !entry.pointer.is_null())
167 .map(|entry| &entry.value)
168 }
169 #[inline]
170 pub fn values_mut(&mut self) -> impl Iterator<Item = &mut T> {
171 self.table
172 .iter_mut()
173 .filter(|entry| !entry.pointer.is_null())
174 .map(|entry| &mut entry.value)
175 }
176 #[inline]
177 pub fn keys(&self) -> KeyIterator<'_, T> {
178 KeyIterator { map: self, pos: 0 }
179 }
180
181 #[inline]
182 pub fn iter(&self) -> impl Iterator<Item = (&str, &T)> {
183 self.table
184 .iter()
185 .filter(|entry| !entry.pointer.is_null())
186 .map(move |entry| (self.read_string(entry.pointer), &entry.value))
187 }
188
189 #[inline]
190 pub fn iter_mut(&mut self) -> impl Iterator<Item = (&str, &mut T)> {
191 let cheated_self = unsafe { &*(self as *mut StringHashMap<T> as *const StringHashMap<T>)};
196 self.table
197 .iter_mut()
198 .filter(|entry| !entry.pointer.is_null())
199 .map(move |entry| {
200 let text = cheated_self.read_string(entry.pointer);
201 (text, &mut entry.value)
202 })
203 }
204
205 #[inline]
206 fn get_entry(&self, hash: usize) -> &TableEntry<T> {
207 unsafe { self.table.get_unchecked(hash) }
208 }
209 #[inline]
210 fn get_entry_mut(&mut self, hash: usize) -> &mut TableEntry<T> {
211 unsafe { self.table.get_unchecked_mut(hash as usize) }
212 }
213
214 #[cold]
217 fn resize(&mut self) {
218 let mut table: Vec<TableEntry<T>> = vec![];
219 table.resize(self.table.len() * 2, TableEntry::default());
220 self.mask = table.len() as u32 - 1;
221
222 std::mem::swap(&mut self.table, &mut table);
223 self.bitshift -= 1;
224 for entry in table.into_iter().filter(|x| !x.pointer.is_null()) {
225 let text = self.read_string(entry.pointer);
226 let text = unsafe { std::mem::transmute::<&str, &'static str>(text) };
229 self.put_entry_resize(text, entry);
230 }
231 }
232
233 #[inline]
234 fn put_in_bucket(&mut self, hash: usize, el: &str, value: T) -> &mut TableEntry<T> {
235 let pos = BytesRef(self.string_data.len() as u32);
236
237 encode_varint_into(&mut self.string_data, el.len() as u32);
238
239 self.string_data.extend_from_slice(el.as_bytes());
240 let entry = self.get_entry_mut(hash);
248 *entry = TableEntry {
249 value,
250 pointer: pos,
251 };
252 entry
253 }
254
255 #[inline]
256 pub(crate) fn read_string(&self, pos: BytesRef) -> &str {
257 let mut pos = pos.addr() as usize;
258 let length_string = decode_varint_slice(&self.string_data, &mut pos).unwrap();
259 unsafe {
260 std::str::from_utf8_unchecked(
261 &self
262 .string_data
263 .get_unchecked(pos..pos + length_string as usize),
264 )
265 }
266 }
267}
268
269#[derive(Debug)]
270pub struct KeyIterator<'a, T> {
271 pub map: &'a StringHashMap<T>,
272 pos: usize,
273}
274
275impl<'a, T> Iterator for KeyIterator<'a, T> {
276 type Item = &'a str;
277
278 #[inline]
279 fn next(&mut self) -> Option<&'a str> {
280 if self.pos == self.map.string_data.len() {
281 None
282 } else {
283 let length_string = decode_varint_slice(&self.map.string_data, &mut self.pos).unwrap();
284 let text = unsafe {
285 std::str::from_utf8_unchecked(
286 &self
287 .map
288 .string_data
289 .get_unchecked(self.pos..self.pos + length_string as usize),
290 )
291 };
292 self.pos += length_string as usize;
293 Some(text)
294 }
295 }
296}
297
298struct QuadraticProbing {
299 hash: u32,
300 i: u32,
301 mask: u32,
302}
303
304impl QuadraticProbing {
305 #[inline]
306 fn compute(hash: u32, mask: u32) -> QuadraticProbing {
307 QuadraticProbing { hash, i: 1, mask }
308 }
309
310 #[inline]
311 fn next_probe(&mut self) -> u32 {
312 self.i += 1;
313 ((self.hash + (self.i + self.i * self.i)) >> 1) & self.mask
314 }
316}
317
318#[cfg(test)]
319mod tests {
320 use super::*;
321 #[test]
322 fn get_values_big() {
323 use std::io::Read;
324
325 let mut contents = String::new();
326 std::fs::File::open("1342-0.txt")
327 .unwrap()
328 .read_to_string(&mut contents)
329 .unwrap();
330
331 let mut map = StringHashMap::<u32>::new();
332 let mut counter = 0;
333 for text in contents.split_whitespace() {
334 let value = map.get_or_create(text, 0);
335 *value += 1;
336 counter += 1;
337 }
338
339 let sum: u32 = map.values().sum();
340 assert_eq!(sum, counter);
341 assert_eq!(map.string_data.len() < 1_000_000, true);
342
343 dbg!(counter);
344
345 }
353 #[test]
354 fn values() {
355 let mut hashmap = StringHashMap::<u32>::new();
356 hashmap.get_or_create("blub", 1);
357
358 let val: u32 = hashmap.values().sum();
359 assert_eq!(val, 1);
360 }
361 #[test]
362 fn simple() {
363 let mut hashmap = StringHashMap::<u32>::new();
364 let val = hashmap.get_or_create("blub1", 0);
365 assert_eq!(*val, 0);
366 *val += 1;
367
368 let val = hashmap.get_or_create("blub2", 2);
369 assert_eq!(*val, 2);
370 }
371 #[test]
372 fn get_or_create() {
373 let mut hashmap = StringHashMap::<u32>::new();
374 let val = hashmap.get_or_create("blub", 0);
375 assert_eq!(*val, 0);
376 *val += 1;
377
378 let val = hashmap.get_or_create("blub", 0);
379 assert_eq!(*val, 1);
380 }
381 #[test]
382 fn test_resize() {
383 let mut hashmap = StringHashMap::<u32>::with_power_of_two_size(1);
384 hashmap.get_or_create("blub1", 3);
385 hashmap.get_or_create("blub2", 4);
386
387 assert_eq!(hashmap.get_or_create("blub1", 3), &3);
388
389 let val = hashmap.get_or_create("blub3", 5);
391 assert_eq!(*val, 5);
392
393 assert_eq!(hashmap.get_or_create("blub1", 0), &3);
395 assert_eq!(hashmap.get_or_create("blub2", 0), &4);
396 assert_eq!(hashmap.get_or_create("blub3", 0), &5);
397 }
398 #[test]
399 fn test_iter() {
400 let mut hashmap = StringHashMap::<u32>::with_power_of_two_size(1);
401 hashmap.get_or_create("blub1", 3);
402 hashmap.get_or_create("blub2", 4);
403 hashmap.get_or_create("blub3", 5);
404
405 assert_eq!(hashmap.get_or_create("blub1", 0), &3);
406 assert_eq!(hashmap.get_or_create("blub2", 0), &4);
407 assert_eq!(hashmap.get_or_create("blub3", 0), &5);
408
409 assert_eq!(
410 hashmap.keys().collect::<Vec<_>>(),
411 &["blub1", "blub2", "blub3"]
412 );
413 assert_eq!(hashmap.values().collect::<Vec<_>>(), &[&5, &4, &3]);
414 assert_eq!(hashmap.values_mut().collect::<Vec<_>>(), &[&5, &4, &3]);
415 assert_eq!(
416 hashmap.iter().collect::<Vec<_>>(),
417 &[("blub3", &5), ("blub2", &4), ("blub1", &3),]
418 );
419 assert_eq!(
420 hashmap.iter_mut().collect::<Vec<_>>(),
421 &[("blub3", &mut 5), ("blub2", &mut 4), ("blub1", &mut 3),]
422 );
423 }
424
425 #[test]
426 fn test_get() {
427 let mut hashmap = StringHashMap::<u32>::with_power_of_two_size(1);
428 hashmap.get_or_create("blub1", 3);
429 hashmap.get_or_create("blub2", 4);
430 hashmap.get_or_create("blub3", 5);
431
432 assert_eq!(hashmap.get_or_create("blub1", 0), &3);
433 assert_eq!(hashmap.get_or_create("blub2", 0), &4);
434 assert_eq!(hashmap.get_mut("blub3"), Some(&mut 5));
435 assert_eq!(hashmap.get("blub3"), Some(&5));
436 assert_eq!(hashmap.get("blub1000"), None);
437 assert_eq!(hashmap.get_mut("blub1000"), None);
438
439 hashmap.shrink_to_fit();
440 }
441 #[test]
442 fn test_len() {
443 let mut hashmap = StringHashMap::<u32>::with_power_of_two_size(1);
444 assert_eq!(hashmap.is_empty(), true);
445 hashmap.get_or_create("blub1", 3);
446 assert_eq!(hashmap.len(), 1);
447 assert_eq!(hashmap.is_empty(), false);
448 hashmap.get_or_create("blub2", 4);
449 hashmap.get_or_create("blub3", 5);
450 assert_eq!(hashmap.len(), 3);
451 assert_eq!(hashmap.is_empty(), false);
452 }
453}