cuckoofilter/lib.rs
1//! Cuckoo filter probabilistic data structure for membership testing and cardinality counting.
2//!
3//! # Usage
4//!
5//! This crate is [on crates.io](https://crates.io/crates/cuckoofilter) and can be
6//! used by adding `cuckoofilter` to the dependencies in your project's `Cargo.toml`.
7//!
8//! ```toml
9//! [dependencies]
10//! cuckoofilter = "0.3"
11//! ```
12//!
13//! And this in your crate root:
14//!
15//! ```rust
16//! extern crate cuckoofilter;
17//! ```
18
19#![cfg_attr(feature = "dev", feature(plugin))]
20#![cfg_attr(feature = "dev", plugin(clippy))]
21
22mod bucket;
23mod util;
24
25use crate::bucket::{Bucket, Fingerprint, BUCKET_SIZE, FINGERPRINT_SIZE};
26use crate::util::{get_alt_index, get_fai, FaI};
27
28use std::cmp;
29use std::collections::hash_map::DefaultHasher;
30use std::error::Error as StdError;
31use std::fmt;
32use std::hash::{Hash, Hasher};
33use std::iter::repeat;
34use std::marker::PhantomData;
35use std::mem;
36
37use rand::Rng;
38#[cfg(feature = "serde_support")]
39use serde_derive::{Deserialize, Serialize};
40
41/// If insertion fails, we will retry this many times.
42pub const MAX_REBUCKET: u32 = 500;
43
44/// The default number of buckets.
45pub const DEFAULT_CAPACITY: usize = (1 << 20) - 1;
46
47#[derive(Debug)]
48pub enum CuckooError {
49 NotEnoughSpace,
50}
51
52impl fmt::Display for CuckooError {
53 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
54 f.write_str("NotEnoughSpace")
55 }
56}
57
58impl StdError for CuckooError {
59 fn description(&self) -> &str {
60 "Not enough space to store this item, rebucketing failed."
61 }
62}
63
64/// A cuckoo filter class exposes a Bloomier filter interface,
65/// providing methods of add, delete, contains.
66///
67/// # Examples
68///
69/// ```
70/// extern crate cuckoofilter;
71///
72/// let words = vec!["foo", "bar", "xylophone", "milagro"];
73/// let mut cf = cuckoofilter::CuckooFilter::new();
74///
75/// let mut insertions = 0;
76/// for s in &words {
77/// if cf.test_and_add(s).unwrap() {
78/// insertions += 1;
79/// }
80/// }
81///
82/// assert_eq!(insertions, words.len());
83/// assert_eq!(cf.len(), words.len());
84///
85/// // Re-add the first element.
86/// cf.add(words[0]);
87///
88/// assert_eq!(cf.len(), words.len() + 1);
89///
90/// for s in &words {
91/// cf.delete(s);
92/// }
93///
94/// assert_eq!(cf.len(), 1);
95/// assert!(!cf.is_empty());
96///
97/// cf.delete(words[0]);
98///
99/// assert_eq!(cf.len(), 0);
100/// assert!(cf.is_empty());
101///
102/// ```
103pub struct CuckooFilter<H> {
104 buckets: Box<[Bucket]>,
105 len: usize,
106 _hasher: std::marker::PhantomData<H>,
107}
108
109impl Default for CuckooFilter<DefaultHasher> {
110 fn default() -> Self {
111 Self::new()
112 }
113}
114
115impl CuckooFilter<DefaultHasher> {
116 /// Construct a CuckooFilter with default capacity and hasher.
117 pub fn new() -> Self {
118 Self::with_capacity(DEFAULT_CAPACITY)
119 }
120}
121
122impl<H> CuckooFilter<H>
123where
124 H: Hasher + Default,
125{
126 /// Constructs a Cuckoo Filter with a given max capacity
127 pub fn with_capacity(cap: usize) -> Self {
128 let capacity = cmp::max(1, cap.next_power_of_two() / BUCKET_SIZE);
129
130 Self {
131 buckets: repeat(Bucket::new())
132 .take(capacity)
133 .collect::<Vec<_>>()
134 .into_boxed_slice(),
135 len: 0,
136 _hasher: PhantomData,
137 }
138 }
139
140 /// Checks if `data` is in the filter.
141 pub fn contains<T: ?Sized + Hash>(&self, data: &T) -> bool {
142 let FaI { fp, i1, i2 } = get_fai::<T, H>(data);
143 let len = self.buckets.len();
144 self.buckets[i1 % len]
145 .get_fingerprint_index(fp)
146 .or_else(|| self.buckets[i2 % len].get_fingerprint_index(fp))
147 .is_some()
148 }
149
150 /// Adds `data` to the filter. Returns `Ok` if the insertion was successful,
151 /// but could fail with a `NotEnoughSpace` error, especially when the filter
152 /// is nearing its capacity.
153 /// Note that while you can put any hashable type in the same filter, beware
154 /// for side effects like that the same number can have diferent hashes
155 /// depending on the type.
156 /// So for the filter, 4711i64 isn't the same as 4711u64.
157 ///
158 /// **Note:** When this returns `NotEnoughSpace`, the element given was
159 /// actually added to the filter, but some random *other* element was
160 /// removed. This might improve in the future.
161 pub fn add<T: ?Sized + Hash>(&mut self, data: &T) -> Result<(), CuckooError> {
162 let fai = get_fai::<T, H>(data);
163 if self.put(fai.fp, fai.i1) || self.put(fai.fp, fai.i2) {
164 return Ok(());
165 }
166 let len = self.buckets.len();
167 let mut rng = rand::thread_rng();
168 let mut i = fai.random_index(&mut rng);
169 let mut fp = fai.fp;
170 for _ in 0..MAX_REBUCKET {
171 let other_fp;
172 {
173 let loc = &mut self.buckets[i % len].buffer[rng.gen_range(0, BUCKET_SIZE)];
174 other_fp = *loc;
175 *loc = fp;
176 i = get_alt_index::<H>(other_fp, i);
177 }
178 if self.put(other_fp, i) {
179 return Ok(());
180 }
181 fp = other_fp;
182 }
183 // fp is dropped here, which means that the last item that was
184 // rebucketed gets removed from the filter.
185 // TODO: One could introduce a single-item cache for this element,
186 // check this cache in all methods additionally to the actual filter,
187 // and return NotEnoughSpace if that cache is already in use.
188 // This would complicate the code, but stop random elements from
189 // getting removed and result in nicer behaviour for the user.
190 Err(CuckooError::NotEnoughSpace)
191 }
192
193 /// Adds `data` to the filter if it does not exist in the filter yet.
194 /// Returns `Ok(true)` if `data` was not yet present in the filter and added
195 /// successfully.
196 pub fn test_and_add<T: ?Sized + Hash>(&mut self, data: &T) -> Result<bool, CuckooError> {
197 if self.contains(data) {
198 Ok(false)
199 } else {
200 self.add(data).map(|_| true)
201 }
202 }
203
204 /// Number of items in the filter.
205 pub fn len(&self) -> usize {
206 self.len
207 }
208
209 /// Exports fingerprints in all buckets, along with the filter's length for storage.
210 /// The filter can be recovered by passing the `ExportedCuckooFilter` struct to the
211 /// `from` method of `CuckooFilter`.
212 pub fn export(&self) -> ExportedCuckooFilter {
213 self.into()
214 }
215
216 /// Number of bytes the filter occupies in memory
217 pub fn memory_usage(&self) -> usize {
218 mem::size_of_val(self) + self.buckets.len() * mem::size_of::<Bucket>()
219 }
220
221 /// Check if filter is empty
222 pub fn is_empty(&self) -> bool {
223 self.len == 0
224 }
225
226 /// Deletes `data` from the filter. Returns true if `data` existed in the
227 /// filter before.
228 pub fn delete<T: ?Sized + Hash>(&mut self, data: &T) -> bool {
229 let FaI { fp, i1, i2 } = get_fai::<T, H>(data);
230 self.remove(fp, i1) || self.remove(fp, i2)
231 }
232
233 /// Extracts fingerprint values from all buckets, used for exporting the filters data.
234 fn values(&self) -> Vec<u8> {
235 self.buckets
236 .iter()
237 .flat_map(|b| b.get_fingerprint_data().into_iter())
238 .collect()
239 }
240
241 /// Removes the item with the given fingerprint from the bucket indexed by i.
242 fn remove(&mut self, fp: Fingerprint, i: usize) -> bool {
243 let len = self.buckets.len();
244 if self.buckets[i % len].delete(fp) {
245 self.len -= 1;
246 true
247 } else {
248 false
249 }
250 }
251
252 fn put(&mut self, fp: Fingerprint, i: usize) -> bool {
253 let len = self.buckets.len();
254 if self.buckets[i % len].insert(fp) {
255 self.len += 1;
256 true
257 } else {
258 false
259 }
260 }
261}
262
263/// A minimal representation of the CuckooFilter which can be transfered or stored, then recovered at a later stage.
264#[derive(Debug)]
265#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
266pub struct ExportedCuckooFilter {
267 #[cfg_attr(feature = "serde_support", serde(with = "serde_bytes"))]
268 pub values: Vec<u8>,
269 pub length: usize,
270}
271
272impl<H> From<ExportedCuckooFilter> for CuckooFilter<H> {
273 /// Converts a simplified representation of a filter used for export to a
274 /// fully functioning version.
275 ///
276 /// # Contents
277 ///
278 /// * `values` - A serialized version of the `CuckooFilter`'s memory, where the
279 /// fingerprints in each bucket are chained one after another, then in turn all
280 /// buckets are chained together.
281 /// * `length` - The number of valid fingerprints inside the `CuckooFilter`.
282 /// This value is used as a time saving method, otherwise all fingerprints
283 /// would need to be checked for equivalence against the null pattern.
284 fn from(exported: ExportedCuckooFilter) -> Self {
285 // Assumes that the `BUCKET_SIZE` and `FINGERPRINT_SIZE` constants do not change.
286 Self {
287 buckets: exported
288 .values
289 .chunks(BUCKET_SIZE * FINGERPRINT_SIZE)
290 .map(Bucket::from)
291 .collect::<Vec<_>>()
292 .into_boxed_slice(),
293 len: exported.length,
294 _hasher: PhantomData,
295 }
296 }
297}
298
299impl<H> From<&CuckooFilter<H>> for ExportedCuckooFilter
300where
301 H: Hasher + Default,
302{
303 /// Converts a `CuckooFilter` into a simplified version which can be serialized and stored
304 /// for later use.
305 fn from(cuckoo: &CuckooFilter<H>) -> Self {
306 Self {
307 values: cuckoo.values(),
308 length: cuckoo.len(),
309 }
310 }
311}