xorfilter/
xor8.rs

1//! Library implements xor-filter.
2//!
3//! This is a port of its
4//! [original implementation](https://github.com/FastFilter/xorfilter)
5//! written in golang.
6
7#[allow(unused_imports)]
8use std::collections::hash_map::{DefaultHasher, RandomState};
9use std::{
10    collections::BTreeMap,
11    convert::TryInto,
12    ffi, fs,
13    hash::{BuildHasher, Hash, Hasher},
14    io::{self, ErrorKind, Read, Write},
15};
16
17use crate::{BuildHasherDefault, Result};
18
19fn murmur64(mut h: u64) -> u64 {
20    h ^= h >> 33;
21    h = h.wrapping_mul(0xff51_afd7_ed55_8ccd);
22    h ^= h >> 33;
23    h = h.wrapping_mul(0xc4ce_b9fe_1a85_ec53);
24    h ^= h >> 33;
25    h
26}
27
28// returns random number, modifies the seed
29fn splitmix64(seed: &mut u64) -> u64 {
30    *seed = (*seed).wrapping_add(0x9E37_79B9_7F4A_7C15);
31    let mut z = *seed;
32    z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
33    z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
34    z ^ (z >> 31)
35}
36
37fn mixsplit(key: u64, seed: u64) -> u64 {
38    murmur64(key.wrapping_add(seed))
39}
40
41fn reduce(hash: u32, n: u32) -> u32 {
42    // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
43    (((hash as u64) * (n as u64)) >> 32) as u32
44}
45
46fn fingerprint(hash: u64) -> u64 {
47    hash ^ (hash >> 32)
48}
49
50#[derive(Clone, Default)]
51struct XorSet {
52    xor_mask: u64,
53    count: u32,
54}
55
56#[derive(Default)]
57struct Hashes {
58    h: u64,
59    h0: u32,
60    h1: u32,
61    h2: u32,
62}
63
64#[derive(Clone, Copy, Default)]
65struct KeyIndex {
66    hash: u64,
67    index: u32,
68}
69
70/// Type Xor8 is probabilistic data-structure to test membership of an element in a set.
71///
72/// This implementation has a false positive rate of about 0.3% and a memory usage of
73/// less than 9 bits per entry for sizeable sets.
74///
75/// Xor8 is parametrized over type `H` which is expected to implement [BuildHasher]
76/// trait, like types [RandomState] and [BuildHasherDefault]. When not supplied,
77/// [BuildHasherDefault] is used as the default hash-builder.
78///
79/// If `RandomState` is used as BuildHasher, `std` has got this to say
80/// > _A particular instance RandomState will create the same instances
81/// > of Hasher, but the hashers created by two different RandomState_
82/// > instances are unlikely to produce the same result for the same values._
83///
84/// If [DefaultHasher] is used as BuildHasher, `std` has got this to say,
85/// > _The internal algorithm is not specified, and so its hashes
86/// > should not be relied upon over releases._
87///
88/// The default type for parameter `H` might change when a reliable and commonly used
89/// BuildHasher type is available.
90pub struct Xor8<H = BuildHasherDefault>
91where
92    H: BuildHasher,
93{
94    keys: Option<BTreeMap<u64, ()>>,
95    pub hash_builder: H,
96    pub seed: u64,
97    pub block_length: u32,
98    pub finger_prints: Vec<u8>,
99}
100
101impl<H> PartialEq for Xor8<H>
102where
103    H: BuildHasher,
104{
105    fn eq(&self, other: &Self) -> bool {
106        self.seed == other.seed
107            && self.block_length == other.block_length
108            && self.finger_prints == other.finger_prints
109    }
110}
111
112impl<H> Default for Xor8<H>
113where
114    H: BuildHasher + Default,
115{
116    fn default() -> Self {
117        Xor8 {
118            keys: Some(BTreeMap::new()),
119            hash_builder: H::default(),
120            seed: u64::default(),
121            block_length: u32::default(),
122            finger_prints: Vec::default(),
123        }
124    }
125}
126
127impl<H> Xor8<H>
128where
129    H: BuildHasher,
130{
131    /// New Xor8 instance initialized with [DefaultHasher].
132    pub fn new() -> Self
133    where
134        H: Default,
135    {
136        Self::default()
137    }
138
139    /// New Xor8 instance initialized with supplied `hasher`.
140    pub fn with_hasher(hash_builder: H) -> Self {
141        Xor8 {
142            keys: Some(BTreeMap::new()),
143            hash_builder,
144            seed: u64::default(),
145            block_length: u32::default(),
146            finger_prints: Vec::default(),
147        }
148    }
149}
150
151impl<H> Xor8<H>
152where
153    H: BuildHasher,
154{
155    /// Insert 64-bit digest of a single key. Digest for the key shall be generated
156    /// using the default-hasher or via hasher supplied via [Xor8::with_hasher] method.
157    pub fn insert<K: ?Sized + Hash>(&mut self, key: &K) {
158        let hashed_key = {
159            let mut hasher = self.hash_builder.build_hasher();
160            key.hash(&mut hasher);
161            hasher.finish()
162        };
163        self.keys.as_mut().unwrap().insert(hashed_key, ());
164    }
165
166    /// Populate with 64-bit digests for a collection of keys of type `K`. Digest for
167    /// key shall be generated using the default-hasher or via hasher supplied
168    /// via [Xor8::with_hasher] method.
169    pub fn populate<K: Hash>(&mut self, keys: &[K]) {
170        keys.iter().for_each(|key| {
171            let mut hasher = self.hash_builder.build_hasher();
172            key.hash(&mut hasher);
173            self.keys.as_mut().unwrap().insert(hasher.finish(), ());
174        })
175    }
176
177    /// Populate with pre-compute collection of 64-bit digests.
178    pub fn populate_keys(&mut self, digests: &[u64]) {
179        for digest in digests.iter() {
180            self.keys.as_mut().unwrap().insert(*digest, ());
181        }
182    }
183
184    /// Build bitmap for keys that where previously inserted using [Xor8::insert],
185    /// [Xor8::populate] and [Xor8::populate_keys] method.
186    pub fn build(&mut self) -> Result<()> {
187        match self.keys.take() {
188            Some(keys) => {
189                let digests = keys.iter().map(|(k, _)| *k).collect::<Vec<u64>>();
190                self.build_keys(&digests)
191            }
192            None => Ok(()),
193        }
194    }
195
196    /// Build a bitmap for pre-computed 64-bit digests for keys. If keys where
197    /// previously inserted using [Xor8::insert] or [Xor8::populate] or
198    /// [Xor8::populate_keys] methods, they shall be ignored.
199    ///
200    /// It is upto the caller to ensure that digests are unique, that there no
201    /// duplicates.
202    pub fn build_keys(&mut self, digests: &[u64]) -> Result<()> {
203        let (size, mut rngcounter) = (digests.len(), 1_u64);
204        let capacity = {
205            let capacity = 32 + ((1.23 * (size as f64)).ceil() as u32);
206            capacity / 3 * 3 // round it down to a multiple of 3
207        };
208        self.seed = splitmix64(&mut rngcounter);
209        self.block_length = capacity / 3;
210        self.finger_prints = vec![u8::default(); capacity as usize];
211
212        let block_length = self.block_length as usize;
213        let mut q0: Vec<KeyIndex> = Vec::with_capacity(block_length);
214        let mut q1: Vec<KeyIndex> = Vec::with_capacity(block_length);
215        let mut q2: Vec<KeyIndex> = Vec::with_capacity(block_length);
216        let mut stack: Vec<KeyIndex> = Vec::with_capacity(size);
217        let mut sets0: Vec<XorSet> = vec![XorSet::default(); block_length];
218        let mut sets1: Vec<XorSet> = vec![XorSet::default(); block_length];
219        let mut sets2: Vec<XorSet> = vec![XorSet::default(); block_length];
220
221        loop {
222            for key in digests.iter() {
223                let hs = self.geth0h1h2(*key);
224                sets0[hs.h0 as usize].xor_mask ^= hs.h;
225                sets0[hs.h0 as usize].count += 1;
226                sets1[hs.h1 as usize].xor_mask ^= hs.h;
227                sets1[hs.h1 as usize].count += 1;
228                sets2[hs.h2 as usize].xor_mask ^= hs.h;
229                sets2[hs.h2 as usize].count += 1;
230            }
231
232            q0.clear();
233            q1.clear();
234            q2.clear();
235
236            let iter = sets0.iter().enumerate().take(self.block_length as usize);
237            for (i, item) in iter {
238                if item.count == 1 {
239                    q0.push(KeyIndex {
240                        index: i as u32,
241                        hash: item.xor_mask,
242                    });
243                }
244            }
245            let iter = sets1.iter().enumerate().take(self.block_length as usize);
246            for (i, item) in iter {
247                if item.count == 1 {
248                    q1.push(KeyIndex {
249                        index: i as u32,
250                        hash: item.xor_mask,
251                    });
252                }
253            }
254            let iter = sets2.iter().enumerate().take(self.block_length as usize);
255            for (i, item) in iter {
256                if item.count == 1 {
257                    q2.push(KeyIndex {
258                        index: i as u32,
259                        hash: item.xor_mask,
260                    });
261                }
262            }
263
264            stack.clear();
265
266            while !q0.is_empty() || !q1.is_empty() || !q2.is_empty() {
267                while let Some(keyindexvar) = q0.pop() {
268                    if sets0[keyindexvar.index as usize].count == 0 {
269                        // not actually possible after the initial scan.
270                        continue;
271                    }
272                    let hash = keyindexvar.hash;
273                    let h1 = self.geth1(hash);
274                    let h2 = self.geth2(hash);
275                    stack.push(keyindexvar);
276
277                    let mut s = unsafe { sets1.get_unchecked_mut(h1 as usize) };
278                    s.xor_mask ^= hash;
279                    s.count -= 1;
280                    if s.count == 1 {
281                        q1.push(KeyIndex {
282                            index: h1,
283                            hash: s.xor_mask,
284                        })
285                    }
286
287                    let mut s = unsafe { sets2.get_unchecked_mut(h2 as usize) };
288                    s.xor_mask ^= hash;
289                    s.count -= 1;
290                    if s.count == 1 {
291                        q2.push(KeyIndex {
292                            index: h2,
293                            hash: s.xor_mask,
294                        })
295                    }
296                }
297                while let Some(mut keyindexvar) = q1.pop() {
298                    if sets1[keyindexvar.index as usize].count == 0 {
299                        continue;
300                    }
301                    let hash = keyindexvar.hash;
302                    let h0 = self.geth0(hash);
303                    let h2 = self.geth2(hash);
304                    keyindexvar.index += self.block_length;
305                    stack.push(keyindexvar);
306
307                    let mut s = unsafe { sets0.get_unchecked_mut(h0 as usize) };
308                    s.xor_mask ^= hash;
309                    s.count -= 1;
310                    if s.count == 1 {
311                        q0.push(KeyIndex {
312                            index: h0,
313                            hash: s.xor_mask,
314                        })
315                    }
316
317                    let mut s = unsafe { sets2.get_unchecked_mut(h2 as usize) };
318                    s.xor_mask ^= hash;
319                    s.count -= 1;
320                    if s.count == 1 {
321                        q2.push(KeyIndex {
322                            index: h2,
323                            hash: s.xor_mask,
324                        })
325                    }
326                }
327                while let Some(mut keyindexvar) = q2.pop() {
328                    if sets2[keyindexvar.index as usize].count == 0 {
329                        continue;
330                    }
331                    let hash = keyindexvar.hash;
332                    let h0 = self.geth0(hash);
333                    let h1 = self.geth1(hash);
334                    keyindexvar.index += 2 * self.block_length;
335                    stack.push(keyindexvar);
336
337                    let mut s = unsafe { sets0.get_unchecked_mut(h0 as usize) };
338                    s.xor_mask ^= hash;
339                    s.count -= 1;
340                    if s.count == 1 {
341                        q0.push(KeyIndex {
342                            index: h0,
343                            hash: s.xor_mask,
344                        })
345                    }
346                    let mut s = unsafe { sets1.get_unchecked_mut(h1 as usize) };
347                    s.xor_mask ^= hash;
348                    s.count -= 1;
349                    if s.count == 1 {
350                        q1.push(KeyIndex {
351                            index: h1,
352                            hash: s.xor_mask,
353                        })
354                    }
355                }
356            }
357
358            if stack.len() == size {
359                break;
360            }
361
362            for item in sets0.iter_mut() {
363                *item = XorSet::default();
364            }
365            for item in sets1.iter_mut() {
366                *item = XorSet::default();
367            }
368            for item in sets2.iter_mut() {
369                *item = XorSet::default();
370            }
371            self.seed = splitmix64(&mut rngcounter)
372        }
373
374        while let Some(ki) = stack.pop() {
375            let mut val = fingerprint(ki.hash) as u8;
376            if ki.index < self.block_length {
377                let h1 = (self.geth1(ki.hash) + self.block_length) as usize;
378                let h2 = (self.geth2(ki.hash) + 2 * self.block_length) as usize;
379                val ^= self.finger_prints[h1] ^ self.finger_prints[h2];
380            } else if ki.index < 2 * self.block_length {
381                let h0 = self.geth0(ki.hash) as usize;
382                let h2 = (self.geth2(ki.hash) + 2 * self.block_length) as usize;
383                val ^= self.finger_prints[h0] ^ self.finger_prints[h2];
384            } else {
385                let h0 = self.geth0(ki.hash) as usize;
386                let h1 = (self.geth1(ki.hash) + self.block_length) as usize;
387                val ^= self.finger_prints[h0] ^ self.finger_prints[h1]
388            }
389            self.finger_prints[ki.index as usize] = val;
390        }
391
392        Ok(())
393    }
394}
395
396impl<H> Xor8<H>
397where
398    H: BuildHasher,
399{
400    /// Contains tell you whether the key is likely part of the set, with false
401    /// positive rate.
402    pub fn contains<K: ?Sized + Hash>(&self, key: &K) -> bool {
403        let hashed_key = {
404            let mut hasher = self.hash_builder.build_hasher();
405            key.hash(&mut hasher);
406            hasher.finish()
407        };
408        self.contains_key(hashed_key)
409    }
410
411    /// Contains tell you whether the key, as pre-computed digest form, is likely
412    /// part of the set, with false positive rate.
413    pub fn contains_key(&self, digest: u64) -> bool {
414        let hash = mixsplit(digest, self.seed);
415        let f = fingerprint(hash) as u8;
416        let r0 = hash as u32;
417        let r1 = hash.rotate_left(21) as u32;
418        let r2 = hash.rotate_left(42) as u32;
419        let h0 = reduce(r0, self.block_length) as usize;
420        let h1 = (reduce(r1, self.block_length) + self.block_length) as usize;
421        let h2 = (reduce(r2, self.block_length) + 2 * self.block_length) as usize;
422        f == (self.finger_prints[h0] ^ self.finger_prints[h1] ^ self.finger_prints[h2])
423    }
424
425    #[allow(dead_code)]
426    fn get_hasher(&self) -> H::Hasher {
427        self.hash_builder.build_hasher()
428    }
429}
430
431impl<H> Xor8<H>
432where
433    H: BuildHasher,
434{
435    fn geth0h1h2(&self, k: u64) -> Hashes {
436        let h = mixsplit(k, self.seed);
437        Hashes {
438            h,
439            h0: reduce(h as u32, self.block_length),
440            h1: reduce(h.rotate_left(21) as u32, self.block_length),
441            h2: reduce(h.rotate_left(42) as u32, self.block_length),
442        }
443    }
444
445    fn geth0(&self, hash: u64) -> u32 {
446        let r0 = hash as u32;
447        reduce(r0, self.block_length)
448    }
449
450    fn geth1(&self, hash: u64) -> u32 {
451        let r1 = hash.rotate_left(21) as u32;
452        reduce(r1, self.block_length)
453    }
454
455    fn geth2(&self, hash: u64) -> u32 {
456        let r2 = hash.rotate_left(42) as u32;
457        reduce(r2, self.block_length)
458    }
459}
460
461/// Implements serialization and de-serialization logic for Xor8. This is still work
462/// in progress, refer to issue: <https://github.com/bnclabs/xorfilter/issues/1>
463/// in github.
464///
465/// TODO: <https://github.com/bnclabs/xorfilter/issues/1>
466impl<H> Xor8<H>
467where
468    H: Into<Vec<u8>> + From<Vec<u8>> + BuildHasher,
469{
470    /// File signature write on first 4 bytes of file.
471    /// ^ stands for xor
472    /// TL stands for filter
473    /// 1 stands for version 1
474    /// 2 stands for version 2
475    const SIGNATURE_V1: [u8; 4] = [b'^', b'T', b'L', 1];
476    const SIGNATURE_V2: [u8; 4] = [b'^', b'T', b'L', 2];
477
478    /// METADATA_LENGTH is size that required to write size of all the
479    /// metadata of the serialized filter.
480    // signature length + seed length + block-length +
481    //      fingerprint length + hasher-builder length + fingerprint + hash-builder
482    const METADATA_LENGTH: usize = 4 + 8 + 4 + 4 + 4;
483
484    /// Write to file in binary format
485    /// TODO Add chechsum of finger_prints into file headers
486    pub fn write_file(&self, path: &ffi::OsStr) -> io::Result<usize>
487    where
488        H: Clone,
489    {
490        let mut f = fs::File::create(path)?;
491        let buf = self.to_bytes();
492        f.write_all(&buf)?;
493        Ok(buf.len())
494    }
495
496    /// Read from file in binary format
497    pub fn read_file(path: &ffi::OsStr) -> io::Result<Self>
498    where
499        H: Default,
500    {
501        let mut f = fs::File::open(path)?;
502        let mut data = Vec::new();
503        f.read_to_end(&mut data)?;
504        Self::from_bytes(data)
505    }
506
507    pub fn to_bytes(&self) -> Vec<u8>
508    where
509        H: Clone,
510    {
511        let capacity = Self::METADATA_LENGTH + self.finger_prints.len();
512        let mut buf: Vec<u8> = Vec::with_capacity(capacity);
513        buf.extend_from_slice(&Xor8::<H>::SIGNATURE_V2);
514        buf.extend_from_slice(&self.seed.to_be_bytes());
515        buf.extend_from_slice(&self.block_length.to_be_bytes());
516        buf.extend_from_slice(&(self.finger_prints.len() as u32).to_be_bytes());
517
518        let hb_binary: Vec<u8> = self.hash_builder.clone().into();
519        buf.extend_from_slice(&(hb_binary.len() as u32).to_be_bytes());
520
521        buf.extend_from_slice(&self.finger_prints);
522        buf.extend_from_slice(&hb_binary);
523        buf
524    }
525
526    pub fn from_bytes(buf: Vec<u8>) -> io::Result<Self>
527    where
528        H: Default,
529    {
530        use std::io::Error;
531
532        let mut n = 0;
533
534        // validate the buf first.
535        if Self::METADATA_LENGTH > buf.len() {
536            return Err(Error::new(ErrorKind::InvalidData, "invalid byte slice"));
537        }
538
539        // check the signature
540        if buf[n..4] == Xor8::<H>::SIGNATURE_V1 {
541            return Self::from_bytes_v1(buf);
542        } else if buf[n..4] != Xor8::<H>::SIGNATURE_V2 {
543            return Err(Error::new(
544                ErrorKind::InvalidData,
545                "File signature incorrect",
546            ));
547        }
548
549        n += 4;
550        // fetch the seed
551        let seed = u64::from_be_bytes(buf[n..n + 8].try_into().unwrap());
552        n += 8;
553        // fetch block_length
554        let block_length = u32::from_be_bytes(buf[n..n + 4].try_into().unwrap());
555        n += 4;
556        // fetch fingerprint length
557        let fp_len = u32::from_be_bytes(buf[n..n + 4].try_into().unwrap()) as usize;
558        n += 4;
559        // fetch hash-serizalized length
560        let hb_len = u32::from_be_bytes(buf[n..n + 4].try_into().unwrap()) as usize;
561        n += 4;
562
563        if buf[n..].len() < (fp_len + hb_len) {
564            return Err(Error::new(ErrorKind::InvalidData, "invalid byte slice"));
565        }
566
567        // fetch the finger print
568        let finger_prints = buf[n..n + fp_len].to_vec();
569        n += fp_len;
570        // fetch the hash_builder
571        let hash_builder: H = buf[n..n + hb_len].to_vec().into();
572
573        Ok(Xor8 {
574            keys: None,
575            hash_builder,
576            seed,
577            block_length,
578            finger_prints,
579        })
580    }
581
582    fn from_bytes_v1(buf: Vec<u8>) -> io::Result<Self>
583    where
584        H: Default,
585    {
586        use std::io::Error;
587
588        // validate the buf first.
589        if Self::METADATA_LENGTH > buf.len() {
590            return Err(Error::new(ErrorKind::InvalidData, "invalid byte slice"));
591        }
592        if buf[..4] != Xor8::<H>::SIGNATURE_V1 {
593            return Err(Error::new(
594                ErrorKind::InvalidData,
595                "File signature incorrect",
596            ));
597        }
598        let fp_len = u32::from_be_bytes(buf[16..20].try_into().unwrap()) as usize;
599        if buf[20..].len() < fp_len {
600            return Err(Error::new(ErrorKind::InvalidData, "invalid byte slice"));
601        }
602        Ok(Xor8 {
603            keys: None,
604            hash_builder: H::default(),
605            seed: u64::from_be_bytes(buf[4..12].try_into().unwrap()),
606            block_length: u32::from_be_bytes(buf[12..16].try_into().unwrap()),
607            finger_prints: buf[20..].to_vec(),
608        })
609    }
610}
611
612#[cfg(test)]
613#[path = "xor8_test.rs"]
614mod xor8_test;