pi_phf_map/
lib.rs

1#![feature(test)]
2extern crate test;
3
4/// 完美hash表,要求k一定不重复
5/// 创建步骤:
6/// 1、先根据总数量来初始化扩大3-6倍的容量,算法为:len*3 向上取2的幂。
7/// 2、先用传入的哈希值,来计算所有k的hash,然后取高位右移落入容量范围,检查是否重复。
8/// 3、如果重复,则将哈希值作为种子,随机取下一个哈希值,继续尝试。
9/// 4、超过指定次数,默认为1000次,则扩容一倍,进行新的一轮尝试。
10/// 5、直到找到一个hash值,能让所有的k都不重复的落在容量空间。然后做前后空间的裁剪来节省内存。
11/// 测试get性能大概PhfMap为1ns, 标准的HashMap大概为3ns
12/// 测试PhfMap::new的性能, 10个kv为200~400ns,30个kv为300~5000ns,100个kv为63~80us,500个kv大概为0.5~0.8ms,可以通过初始大小来大幅提高new的性能,如果想更快的创建,则可以考虑,将k分段,减少单表的数量到16-32左右。
13/// 数组长度上, 10个kv一般为32,30个kv为128,100个kv为1024,500个kv大概为16384或32768
14use core::fmt::*;
15use core::hash::Hash;
16use core::ops::{Index, IndexMut};
17use fixedbitset::FixedBitSet;
18use fxhash::FxHasher64;
19use pi_null::Null;
20use pi_wy_rng::WyRng;
21use rand::{RngCore, SeedableRng};
22use std::{hash::Hasher, marker::PhantomData, mem};
23
24pub struct PhfMap<K: Hash, V: Null> {
25    /// 值数组,空位为Null的V
26    lut: Vec<V>,
27    /// 元素的数量
28    len: usize,
29    /// hasher
30    hasher: u64,
31    /// 右移值
32    right_shift: u32,
33    /// 起始的偏移量
34    offset: u32,
35    _k: PhantomData<K>,
36}
37
38// 当前默认的HashMap和HashSet(使用根据平台字长、和feature来决定的DefaultHasher)
39impl<K: Hash, V: Null> PhfMap<K, V> {
40    /// 用指定的kv键创建完美hash表
41    pub fn new(vec: Vec<(K, V)>) -> Self {
42        Self::with_hasher(vec, 0)
43    }
44    /// 用指定的KV迭代器和指定的hasher创建完美hash表
45    pub fn with_hasher(vec: Vec<(K, V)>, mut hasher: u64) -> Self {
46        // 经验判断, 一般键数组长度的3~6倍的初始容量会比2~4倍的容量,在创建速度上快10倍
47        // 扩大3~6倍
48        let len = vec.len();
49        if len == 0 {
50            return PhfMap {
51                lut: vec![],
52                len,
53                hasher,
54                right_shift: 0,
55                offset: 0,
56                _k: PhantomData,
57            }
58        }
59        let mut right_shift = (len as u64 * 3).leading_zeros();
60        let size = 1 << (u64::BITS - right_shift);
61        // println!("right_shift:{}, size:{:?}", right_shift, size);
62        let mut conflicts = FixedBitSet::with_capacity(size);
63        let mut count = 0;
64        let mut r = WyRng::seed_from_u64(hasher);
65        loop {
66            if Self::test(&vec, hasher, right_shift, &mut conflicts) {
67                // println!("count:{}, right_shift: {}, size:{:?}", count, right_shift, conflicts.len());
68                return Self::make(vec, hasher, right_shift, conflicts);
69            }
70            count += 1;
71            if count >= 1024 {
72                // 扩容
73                right_shift -= 1;
74                let size = 1 << (u64::BITS - right_shift);
75                conflicts = FixedBitSet::with_capacity(size);
76                hasher = r.next_u64();
77                count = 0;
78                continue;
79            }
80            conflicts.clear();
81            hasher = r.next_u64();
82        }
83    }
84    fn test(vec: &Vec<(K, V)>, hasher: u64, right_shift: u32, conflicts: &mut FixedBitSet) -> bool {
85        for (k, _v) in vec.iter() {
86            let i = Self::hash(k, hasher, right_shift);
87            if conflicts.contains(i) {
88                return false;
89            }
90            conflicts.set(i, true);
91        }
92        true
93    }
94    fn make(vec: Vec<(K, V)>, hasher: u64, right_shift: u32, conflicts: FixedBitSet) -> Self {
95        let mut offset = 0;
96        for i in 0..conflicts.len() {
97            if conflicts.contains(i) {
98                break;
99            }
100            offset += 1;
101        }
102        let mut end = conflicts.len();
103        for i in (0..conflicts.len()).rev() {
104            if conflicts.contains(i) {
105                break;
106            }
107            end -= 1;
108        }
109        let len = end - offset as usize;
110        let mut lut = Vec::with_capacity(len);
111        lut.extend((0..len).map(|_| V::null()));
112        for (k, v) in vec.into_iter() {
113            let i = Self::hash(&k, hasher, right_shift);
114            lut[i - offset as usize] = v;
115        }
116        PhfMap {
117            lut,
118            len,
119            hasher,
120            right_shift,
121            offset,
122            _k: PhantomData,
123        }
124    }
125    /// 获得kv表的容量
126    pub fn len(&self) -> usize {
127        self.len
128    }
129    /// 获得kv表的容量
130    pub fn capacity(&self) -> usize {
131        self.lut.len()
132    }
133    /// 获得指定键的只读引用
134    #[inline(always)]
135    pub fn get(&self, k: &K) -> Option<&V> {
136        let h = self.location(k);
137        self.lut.get(h)
138    }
139    /// 获得指定键的可写引用
140    #[inline(always)]
141    pub fn get_mut(&mut self, k: &K) -> Option<&mut V> {
142        let h = self.location(k);
143        self.lut.get_mut(h)
144    }
145    /// 获得指定键的只读引用
146    #[inline(always)]
147    pub unsafe fn get_unchecked(&self, k: &K) -> &V {
148        let h = self.location(k);
149        self.lut.get_unchecked(h)
150    }
151    /// 获得指定键的可写引用
152    #[inline(always)]
153    pub unsafe fn get_unchecked_mut(&mut self, k: &K) -> &mut V {
154        let h = self.location(k);
155        self.lut.get_unchecked_mut(h)
156    }
157    #[inline(always)]
158    pub fn val_iter(&self) -> Iter<'_, V> {
159        Iter {
160            lut: &self.lut,
161            idx: 0,
162            len: self.len,
163        }
164    }
165    #[inline(always)]
166    pub fn val_iter_mut(&mut self) -> IterMut<'_, V> {
167        IterMut {
168            lut: &mut self.lut,
169            idx: 0,
170            len: self.len,
171        }
172    }
173    #[inline(always)]
174    pub fn into_vec(self) -> Vec<V> {
175        self.lut
176    }
177    /// 获得k的hash
178    #[inline(always)]
179    fn location(&self, k: &K) -> usize {
180        Self::hash(k, self.hasher, self.right_shift).wrapping_sub(self.offset as usize)
181    }
182    /// 获得k的hash
183    #[inline(always)]
184    fn hash(k: &K, hasher: u64, right_shift: u32) -> usize {
185        let mut state: FxHasher64 = unsafe { mem::transmute(hasher) };
186        k.hash(&mut state);
187        (state.finish() >> right_shift) as usize
188    }
189}
190impl<K: Hash, V: Null> Index<K> for PhfMap<K, V> {
191    type Output = V;
192    fn index(&self, key: K) -> &V {
193        self.get(&key).unwrap()
194    }
195}
196impl<K: Hash, V: Null> IndexMut<K> for PhfMap<K, V> {
197    fn index_mut(&mut self, key: K) -> &mut V {
198        self.get_mut(&key).unwrap()
199    }
200}
201
202impl<K: Hash, V: Null + Clone> Clone for PhfMap<K, V> {
203    fn clone(&self) -> Self {
204        PhfMap {
205            lut: self.lut.clone(),
206            len: self.len,
207            hasher: self.hasher,
208            right_shift: self.right_shift,
209            offset: self.offset,
210            _k: PhantomData,
211        }
212    }
213}
214
215impl<K: Hash, V: Null + Debug> Debug for PhfMap<K, V> {
216    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
217        let vec: Vec<&V> = self.val_iter().collect();
218        f.debug_struct("PhfMap")
219            .field("lut", &vec.as_slice())
220            .field("len", &self.len)
221            .field("hasher", &self.hasher)
222            .field("right_shift", &self.right_shift)
223            .field("offset", &self.offset)
224            .finish()
225    }
226}
227
228pub struct Iter<'a, T: Null> {
229    lut: &'a Vec<T>,
230    idx: usize,
231    len: usize,
232}
233
234impl<'a, T: Null> Iterator for Iter<'a, T> {
235    type Item = &'a T;
236    fn next(&mut self) -> Option<Self::Item> {
237        while self.idx < self.lut.len() {
238            let v = unsafe { self.lut.get_unchecked(self.idx) };
239            self.idx += 1;
240            if !v.is_null() {
241                return Some(v);
242            }
243        }
244        None
245    }
246    fn size_hint(&self) -> (usize, Option<usize>) {
247        (self.len, Some(self.len))
248    }
249}
250impl<T: Null + Debug> Debug for Iter<'_, T> {
251    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
252        f.debug_tuple("Iter").field(&self.idx).finish()
253    }
254}
255
256pub struct IterMut<'a, V: Null> {
257    lut: &'a mut Vec<V>,
258    idx: usize,
259    len: usize,
260}
261
262impl<'a, V: Null> Iterator for IterMut<'a, V> {
263    type Item = &'a mut V;
264    fn next(&mut self) -> Option<Self::Item> {
265        let len = self.lut.len();
266        while self.idx < len {
267            let v = unsafe { self.lut.get_unchecked_mut(self.idx) as *mut V };
268            self.idx += 1;
269            if !v.is_null() {
270                return Some(unsafe { &mut *v });
271            }
272        }
273        None
274    }
275    fn size_hint(&self) -> (usize, Option<usize>) {
276        (self.len, Some(self.len))
277    }
278}
279#[cfg(test)]
280mod test_mod {
281    extern crate rand;
282
283    use crate::*;
284    use rand::Rng;
285    use test::Bencher;
286
287    #[test]
288    fn test() {
289        // 32 - 126
290        // 128 - 600
291        // 512 - 3200
292        //let mut rng = rand::thread_rng();
293        let mut rng = pi_wy_rng::WyRng::seed_from_u64(22222);
294        let mut arr = Vec::new();
295        for _ in 0..16 {
296            let k = rng.gen::<usize>();
297            arr.push((k, k + 1));
298        }
299        let map = PhfMap::with_hasher(arr.clone(), 0);
300        println!("map:{:?}", map);
301        for (k, v) in arr {
302            let n = map[k];
303            assert_eq!(n, v);
304        }
305    }
306
307    #[bench]
308    fn bench_make(b: &mut Bencher) {
309        use rand::Rng;
310
311        let mut rng = rand::thread_rng();
312        let mut arr = Vec::new();
313        for _ in 0..430 {
314            let k = rng.gen::<usize>();
315            arr.push((k, k + 1));
316        }
317        b.iter(move || {
318            let map = PhfMap::with_hasher(arr.clone(), 0);
319            for (k, v) in arr.iter() {
320                let n = map.get(&k).unwrap();
321                assert_eq!(n, v);
322            }
323        });
324    }
325    #[bench]
326    fn bench_test(b: &mut Bencher) {
327        use rand::Rng;
328        let mut rng = rand::thread_rng();
329        let mut arr = Vec::new();
330        for _ in 0..430 {
331            let k = rng.gen::<usize>();
332            arr.push((k, k + 1));
333        }
334        let map = PhfMap::with_hasher(arr.clone(), 0);
335        println!("map capacity:{}", map.capacity());
336        b.iter(move || {
337            for &(k, v) in &arr {
338                let n = map.get(&k).unwrap();
339                assert_eq!(*n, v);
340            }
341        });
342    }
343    #[bench]
344    fn bench_test_arr(b: &mut Bencher) {
345        use rand::Rng;
346        let mut rng = rand::thread_rng();
347        let mut arr = Vec::new();
348        for _ in 0..430 {
349            let k = rng.gen::<usize>();
350            arr.push((k, k + 1));
351        }
352        let mut suffle = arr.clone();
353        suffle.sort();
354        println!("arr len:{}", 1);
355        b.iter(move || {
356            for &(k, _v) in &arr {
357                let n = suffle.iter().position(|&x| x.0 == k);
358                assert!(n.is_some());
359            }
360        });
361    }
362    #[bench]
363    fn bench_test_map(b: &mut Bencher) {
364        use rand::Rng;
365        use std::collections::HashMap;
366        use std::hash::BuildHasherDefault;
367
368        pub type XHashMap<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
369        let mut rng = rand::thread_rng();
370        let mut arr = Vec::new();
371        let mut map: XHashMap<usize, usize> = XHashMap::with_hasher(Default::default());
372        for _ in 0..430 {
373            let k = rng.gen::<usize>();
374            arr.push((k, k + 1));
375            map.insert(k, k + 1);
376        }
377        b.iter(move || {
378            for &(k, v) in &arr {
379                let n = map.get(&k);
380                assert_eq!(*n.unwrap(), v);
381            }
382        });
383    }
384}