cuckoofilter/
lib.rs

1//! Cuckoo filter probabilistic data structure for membership testing and cardinality counting.
2//!
3//! # Usage
4//!
5//! This crate is [on crates.io](https://crates.io/crates/cuckoofilter) and can be
6//! used by adding `cuckoofilter` to the dependencies in your project's `Cargo.toml`.
7//!
8//! ```toml
9//! [dependencies]
10//! cuckoofilter = "0.3"
11//! ```
12//!
13//! And this in your crate root:
14//!
15//! ```rust
16//! extern crate cuckoofilter;
17//! ```
18
19#![cfg_attr(feature = "dev", feature(plugin))]
20#![cfg_attr(feature = "dev", plugin(clippy))]
21
22mod bucket;
23mod util;
24
25use crate::bucket::{Bucket, Fingerprint, BUCKET_SIZE, FINGERPRINT_SIZE};
26use crate::util::{get_alt_index, get_fai, FaI};
27
28use std::cmp;
29use std::collections::hash_map::DefaultHasher;
30use std::error::Error as StdError;
31use std::fmt;
32use std::hash::{Hash, Hasher};
33use std::iter::repeat;
34use std::marker::PhantomData;
35use std::mem;
36
37use rand::Rng;
38#[cfg(feature = "serde_support")]
39use serde_derive::{Deserialize, Serialize};
40
41/// If insertion fails, we will retry this many times.
42pub const MAX_REBUCKET: u32 = 500;
43
44/// The default number of buckets.
45pub const DEFAULT_CAPACITY: usize = (1 << 20) - 1;
46
47#[derive(Debug)]
48pub enum CuckooError {
49    NotEnoughSpace,
50}
51
52impl fmt::Display for CuckooError {
53    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
54        f.write_str("NotEnoughSpace")
55    }
56}
57
58impl StdError for CuckooError {
59    fn description(&self) -> &str {
60        "Not enough space to store this item, rebucketing failed."
61    }
62}
63
64/// A cuckoo filter class exposes a Bloomier filter interface,
65/// providing methods of add, delete, contains.
66///
67/// # Examples
68///
69/// ```
70/// extern crate cuckoofilter;
71///
72/// let words = vec!["foo", "bar", "xylophone", "milagro"];
73/// let mut cf = cuckoofilter::CuckooFilter::new();
74///
75/// let mut insertions = 0;
76/// for s in &words {
77///     if cf.test_and_add(s).unwrap() {
78///         insertions += 1;
79///     }
80/// }
81///
82/// assert_eq!(insertions, words.len());
83/// assert_eq!(cf.len(), words.len());
84///
85/// // Re-add the first element.
86/// cf.add(words[0]);
87///
88/// assert_eq!(cf.len(), words.len() + 1);
89///
90/// for s in &words {
91///     cf.delete(s);
92/// }
93///
94/// assert_eq!(cf.len(), 1);
95/// assert!(!cf.is_empty());
96///
97/// cf.delete(words[0]);
98///
99/// assert_eq!(cf.len(), 0);
100/// assert!(cf.is_empty());
101///
102/// ```
103pub struct CuckooFilter<H> {
104    buckets: Box<[Bucket]>,
105    len: usize,
106    _hasher: std::marker::PhantomData<H>,
107}
108
109impl Default for CuckooFilter<DefaultHasher> {
110    fn default() -> Self {
111        Self::new()
112    }
113}
114
115impl CuckooFilter<DefaultHasher> {
116    /// Construct a CuckooFilter with default capacity and hasher.
117    pub fn new() -> Self {
118        Self::with_capacity(DEFAULT_CAPACITY)
119    }
120}
121
122impl<H> CuckooFilter<H>
123where
124    H: Hasher + Default,
125{
126    /// Constructs a Cuckoo Filter with a given max capacity
127    pub fn with_capacity(cap: usize) -> Self {
128        let capacity = cmp::max(1, cap.next_power_of_two() / BUCKET_SIZE);
129
130        Self {
131            buckets: repeat(Bucket::new())
132                .take(capacity)
133                .collect::<Vec<_>>()
134                .into_boxed_slice(),
135            len: 0,
136            _hasher: PhantomData,
137        }
138    }
139
140    /// Checks if `data` is in the filter.
141    pub fn contains<T: ?Sized + Hash>(&self, data: &T) -> bool {
142        let FaI { fp, i1, i2 } = get_fai::<T, H>(data);
143        let len = self.buckets.len();
144        self.buckets[i1 % len]
145            .get_fingerprint_index(fp)
146            .or_else(|| self.buckets[i2 % len].get_fingerprint_index(fp))
147            .is_some()
148    }
149
150    /// Adds `data` to the filter. Returns `Ok` if the insertion was successful,
151    /// but could fail with a `NotEnoughSpace` error, especially when the filter
152    /// is nearing its capacity.
153    /// Note that while you can put any hashable type in the same filter, beware
154    /// for side effects like that the same number can have diferent hashes
155    /// depending on the type.
156    /// So for the filter, 4711i64 isn't the same as 4711u64.
157    ///
158    /// **Note:** When this returns `NotEnoughSpace`, the element given was
159    /// actually added to the filter, but some random *other* element was
160    /// removed. This might improve in the future.
161    pub fn add<T: ?Sized + Hash>(&mut self, data: &T) -> Result<(), CuckooError> {
162        let fai = get_fai::<T, H>(data);
163        if self.put(fai.fp, fai.i1) || self.put(fai.fp, fai.i2) {
164            return Ok(());
165        }
166        let len = self.buckets.len();
167        let mut rng = rand::thread_rng();
168        let mut i = fai.random_index(&mut rng);
169        let mut fp = fai.fp;
170        for _ in 0..MAX_REBUCKET {
171            let other_fp;
172            {
173                let loc = &mut self.buckets[i % len].buffer[rng.gen_range(0, BUCKET_SIZE)];
174                other_fp = *loc;
175                *loc = fp;
176                i = get_alt_index::<H>(other_fp, i);
177            }
178            if self.put(other_fp, i) {
179                return Ok(());
180            }
181            fp = other_fp;
182        }
183        // fp is dropped here, which means that the last item that was
184        // rebucketed gets removed from the filter.
185        // TODO: One could introduce a single-item cache for this element,
186        // check this cache in all methods additionally to the actual filter,
187        // and return NotEnoughSpace if that cache is already in use.
188        // This would complicate the code, but stop random elements from
189        // getting removed and result in nicer behaviour for the user.
190        Err(CuckooError::NotEnoughSpace)
191    }
192
193    /// Adds `data` to the filter if it does not exist in the filter yet.
194    /// Returns `Ok(true)` if `data` was not yet present in the filter and added
195    /// successfully.
196    pub fn test_and_add<T: ?Sized + Hash>(&mut self, data: &T) -> Result<bool, CuckooError> {
197        if self.contains(data) {
198            Ok(false)
199        } else {
200            self.add(data).map(|_| true)
201        }
202    }
203
204    /// Number of items in the filter.
205    pub fn len(&self) -> usize {
206        self.len
207    }
208
209    /// Exports fingerprints in all buckets, along with the filter's length for storage.
210    /// The filter can be recovered by passing the `ExportedCuckooFilter` struct to the
211    /// `from` method of `CuckooFilter`.
212    pub fn export(&self) -> ExportedCuckooFilter {
213        self.into()
214    }
215
216    /// Number of bytes the filter occupies in memory
217    pub fn memory_usage(&self) -> usize {
218        mem::size_of_val(self) + self.buckets.len() * mem::size_of::<Bucket>()
219    }
220
221    /// Check if filter is empty
222    pub fn is_empty(&self) -> bool {
223        self.len == 0
224    }
225
226    /// Deletes `data` from the filter. Returns true if `data` existed in the
227    /// filter before.
228    pub fn delete<T: ?Sized + Hash>(&mut self, data: &T) -> bool {
229        let FaI { fp, i1, i2 } = get_fai::<T, H>(data);
230        self.remove(fp, i1) || self.remove(fp, i2)
231    }
232
233    /// Extracts fingerprint values from all buckets, used for exporting the filters data.
234    fn values(&self) -> Vec<u8> {
235        self.buckets
236            .iter()
237            .flat_map(|b| b.get_fingerprint_data().into_iter())
238            .collect()
239    }
240
241    /// Removes the item with the given fingerprint from the bucket indexed by i.
242    fn remove(&mut self, fp: Fingerprint, i: usize) -> bool {
243        let len = self.buckets.len();
244        if self.buckets[i % len].delete(fp) {
245            self.len -= 1;
246            true
247        } else {
248            false
249        }
250    }
251
252    fn put(&mut self, fp: Fingerprint, i: usize) -> bool {
253        let len = self.buckets.len();
254        if self.buckets[i % len].insert(fp) {
255            self.len += 1;
256            true
257        } else {
258            false
259        }
260    }
261}
262
263/// A minimal representation of the CuckooFilter which can be transfered or stored, then recovered at a later stage.
264#[derive(Debug)]
265#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
266pub struct ExportedCuckooFilter {
267    #[cfg_attr(feature = "serde_support", serde(with = "serde_bytes"))]
268    pub values: Vec<u8>,
269    pub length: usize,
270}
271
272impl<H> From<ExportedCuckooFilter> for CuckooFilter<H> {
273    /// Converts a simplified representation of a filter used for export to a
274    /// fully functioning version.
275    ///
276    /// # Contents
277    ///
278    /// * `values` - A serialized version of the `CuckooFilter`'s memory, where the
279    /// fingerprints in each bucket are chained one after another, then in turn all
280    /// buckets are chained together.
281    /// * `length` - The number of valid fingerprints inside the `CuckooFilter`.
282    /// This value is used as a time saving method, otherwise all fingerprints
283    /// would need to be checked for equivalence against the null pattern.
284    fn from(exported: ExportedCuckooFilter) -> Self {
285        // Assumes that the `BUCKET_SIZE` and `FINGERPRINT_SIZE` constants do not change.
286        Self {
287            buckets: exported
288                .values
289                .chunks(BUCKET_SIZE * FINGERPRINT_SIZE)
290                .map(Bucket::from)
291                .collect::<Vec<_>>()
292                .into_boxed_slice(),
293            len: exported.length,
294            _hasher: PhantomData,
295        }
296    }
297}
298
299impl<H> From<&CuckooFilter<H>> for ExportedCuckooFilter
300where
301    H: Hasher + Default,
302{
303    /// Converts a `CuckooFilter` into a simplified version which can be serialized and stored
304    /// for later use.
305    fn from(cuckoo: &CuckooFilter<H>) -> Self {
306        Self {
307            values: cuckoo.values(),
308            length: cuckoo.len(),
309        }
310    }
311}