Skip to main content

bloom_lib/
cuckoo.rs

1//! A Cuckoo filter: approximate set membership with deletion support.
2
3use core::{hash::BuildHasher, marker::PhantomData};
4
5use alloc::{vec, vec::Vec};
6
7use crate::{
8    hash::{mix64, DefaultHashBuilder},
9    Error,
10};
11
12/// Fingerprints per bucket. Four is the value analysed by the original paper
13/// for a good balance of load factor and lookup cost.
14const BUCKET_SIZE: usize = 4;
15
16/// Maximum number of relocations attempted before declaring an insertion
17/// failure. The paper shows 500 keeps failure probability negligible below the
18/// load threshold.
19const MAX_KICKS: usize = 500;
20
21/// Sentinel fingerprint value meaning "empty slot". Real fingerprints are
22/// remapped away from zero, so this never collides with stored data.
23const EMPTY: u16 = 0;
24
25/// The held-out element produced when a relocation chain exhausts its budget.
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
27#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
28struct Victim {
29    fingerprint: u16,
30    index: usize,
31}
32
33/// A probabilistic set that supports deletion.
34///
35/// Like a [`BloomFilter`](crate::BloomFilter), a Cuckoo filter answers
36/// approximate membership queries in a fraction of the memory an exact set would
37/// need, and never reports a false negative for an item that is present.
38/// Unlike a Bloom filter it supports [`remove`](Self::remove): items can be
39/// deleted without rebuilding the structure.
40///
41/// The filter stores a short *fingerprint* of each item (16 bits here) in one of
42/// two candidate buckets, relocating existing fingerprints as needed to make
43/// room — the "cuckoo" behaviour. The false-positive rate is fixed by the
44/// fingerprint width at roughly 0.012%; for a tunable rate without deletion, use
45/// a [`BloomFilter`](crate::BloomFilter).
46///
47/// The filter is generic over the item type `T` and a
48/// [`BuildHasher`](core::hash::BuildHasher) `S`, defaulting to the deterministic
49/// [`DefaultHashBuilder`](crate::hash::DefaultHashBuilder).
50///
51/// # Examples
52///
53/// ```
54/// use bloom_lib::CuckooFilter;
55///
56/// let mut filter = CuckooFilter::new(1_000).unwrap();
57///
58/// filter.insert("alice").unwrap();
59/// assert!(filter.contains("alice"));
60///
61/// assert!(filter.remove("alice"));
62/// assert!(!filter.contains("alice"));
63/// ```
64#[derive(Debug, Clone)]
65#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
66pub struct CuckooFilter<T: ?Sized, S = DefaultHashBuilder> {
67    buckets: Vec<u16>,
68    mask: usize,
69    len: usize,
70    victim: Option<Victim>,
71    rng_state: u64,
72    #[cfg_attr(feature = "serde", serde(skip))]
73    hasher: S,
74    #[cfg_attr(feature = "serde", serde(skip))]
75    _marker: PhantomData<fn(&T)>,
76}
77
78impl<T: ?Sized> CuckooFilter<T, DefaultHashBuilder> {
79    /// Creates a filter able to hold roughly `capacity` items, using the default
80    /// hasher.
81    ///
82    /// The number of buckets is rounded up to a power of two with headroom, so
83    /// the reported [`capacity`](Self::capacity) is at least `capacity` and
84    /// usually larger. Inserting close to the reported capacity may begin to
85    /// fail with [`Error::CapacityExceeded`]; size with margin for write-heavy
86    /// workloads.
87    ///
88    /// # Parameters
89    ///
90    /// - `capacity`: the expected number of distinct items. Must be non-zero.
91    ///
92    /// # Errors
93    ///
94    /// Returns [`Error::InvalidParameter`] if `capacity` is zero.
95    ///
96    /// # Examples
97    ///
98    /// ```
99    /// use bloom_lib::CuckooFilter;
100    ///
101    /// let filter = CuckooFilter::<&str>::new(10_000).unwrap();
102    /// assert!(filter.capacity() >= 10_000);
103    /// assert!(filter.is_empty());
104    /// ```
105    pub fn new(capacity: usize) -> Result<Self, Error> {
106        Self::with_hasher(capacity, DefaultHashBuilder)
107    }
108}
109
110impl<T: ?Sized, S: BuildHasher> CuckooFilter<T, S> {
111    /// Creates a filter for roughly `capacity` items with a caller-supplied
112    /// hasher.
113    ///
114    /// # Errors
115    ///
116    /// Returns [`Error::InvalidParameter`] if `capacity` is zero.
117    ///
118    /// # Examples
119    ///
120    /// ```
121    /// # #[cfg(feature = "std")] {
122    /// use std::collections::hash_map::RandomState;
123    /// use bloom_lib::CuckooFilter;
124    ///
125    /// let filter: CuckooFilter<&str, RandomState> =
126    ///     CuckooFilter::with_hasher(1_000, RandomState::new()).unwrap();
127    /// # }
128    /// ```
129    pub fn with_hasher(capacity: usize, hasher: S) -> Result<Self, Error> {
130        if capacity == 0 {
131            return Err(Error::InvalidParameter {
132                param: "capacity",
133                reason: "must be greater than zero",
134            });
135        }
136
137        let num_buckets = capacity_to_buckets(capacity);
138        Ok(Self {
139            buckets: vec![EMPTY; num_buckets * BUCKET_SIZE],
140            mask: num_buckets - 1,
141            len: 0,
142            victim: None,
143            // Non-zero xorshift seed derived from the table size.
144            rng_state: 0x9E37_79B9_7F4A_7C15 ^ num_buckets as u64,
145            hasher,
146            _marker: PhantomData,
147        })
148    }
149
150    /// Inserts `item` into the filter.
151    ///
152    /// Duplicates are permitted: inserting the same item twice stores two
153    /// fingerprints and requires two [`remove`](Self::remove) calls to fully
154    /// evict. This matches the semantics of the original Cuckoo filter and keeps
155    /// `remove` exact with respect to insertions.
156    ///
157    /// # Errors
158    ///
159    /// Returns [`Error::CapacityExceeded`] when the filter is too full to place
160    /// the item within its relocation budget. The filter is unchanged in that
161    /// case and remains usable for lookups and deletions.
162    ///
163    /// # Examples
164    ///
165    /// ```
166    /// use bloom_lib::CuckooFilter;
167    ///
168    /// let mut filter = CuckooFilter::new(100).unwrap();
169    /// filter.insert(&42u32).unwrap();
170    /// assert!(filter.contains(&42u32));
171    /// ```
172    pub fn insert(&mut self, item: &T) -> Result<(), Error>
173    where
174        T: core::hash::Hash,
175    {
176        // A held-out victim means the table is already at its practical limit.
177        if self.victim.is_some() {
178            return Err(Error::CapacityExceeded);
179        }
180
181        let hash = self.hasher.hash_one(item);
182        let fingerprint = fingerprint_of(hash);
183        let i1 = (hash as usize) & self.mask;
184        let i2 = self.alt_index(i1, fingerprint);
185
186        if self.try_put(i1, fingerprint) || self.try_put(i2, fingerprint) {
187            self.len += 1;
188            return Ok(());
189        }
190
191        // Both candidate buckets are full: relocate a resident fingerprint.
192        let mut index = if self.next_rng() & 1 == 0 { i1 } else { i2 };
193        let mut moving = fingerprint;
194        for _ in 0..MAX_KICKS {
195            let slot = (self.next_rng() as usize) & (BUCKET_SIZE - 1);
196            let pos = index * BUCKET_SIZE + slot;
197            core::mem::swap(&mut moving, &mut self.buckets[pos]);
198            index = self.alt_index(index, moving);
199            if self.try_put(index, moving) {
200                self.len += 1;
201                return Ok(());
202            }
203        }
204
205        // Budget exhausted. `moving` is now homeless; stash it in the victim
206        // slot so no previously-inserted element is lost, and count the new
207        // element we successfully placed during the first swap.
208        self.victim = Some(Victim {
209            fingerprint: moving,
210            index,
211        });
212        self.len += 1;
213        Ok(())
214    }
215
216    /// Tests whether `item` is in the filter.
217    ///
218    /// Returns `false` only if the item was definitely never inserted (or has
219    /// since been removed). A return of `true` means the item is *probably*
220    /// present, with a false-positive probability of roughly 0.012%.
221    ///
222    /// # Examples
223    ///
224    /// ```
225    /// use bloom_lib::CuckooFilter;
226    ///
227    /// let mut filter = CuckooFilter::new(100).unwrap();
228    /// filter.insert("present").unwrap();
229    /// assert!(filter.contains("present"));
230    /// assert!(!filter.contains("absent"));
231    /// ```
232    #[must_use]
233    pub fn contains(&self, item: &T) -> bool
234    where
235        T: core::hash::Hash,
236    {
237        let hash = self.hasher.hash_one(item);
238        let fingerprint = fingerprint_of(hash);
239        let i1 = (hash as usize) & self.mask;
240        let i2 = self.alt_index(i1, fingerprint);
241        self.bucket_contains(i1, fingerprint)
242            || self.bucket_contains(i2, fingerprint)
243            || self.victim_matches(fingerprint, i1, i2)
244    }
245
246    /// Removes one occurrence of `item`, returning `true` if it was present.
247    ///
248    /// Only a single stored fingerprint is removed per call, mirroring `insert`.
249    /// Because fingerprints collide, removing an item that was never inserted
250    /// but collides with a present one can delete the wrong fingerprint; this is
251    /// the same false-positive trade-off that governs `contains`. Only delete
252    /// items you previously inserted.
253    ///
254    /// # Examples
255    ///
256    /// ```
257    /// use bloom_lib::CuckooFilter;
258    ///
259    /// let mut filter = CuckooFilter::new(100).unwrap();
260    /// filter.insert("temp").unwrap();
261    /// assert!(filter.remove("temp"));
262    /// assert!(!filter.remove("temp")); // already gone
263    /// ```
264    pub fn remove(&mut self, item: &T) -> bool
265    where
266        T: core::hash::Hash,
267    {
268        let hash = self.hasher.hash_one(item);
269        let fingerprint = fingerprint_of(hash);
270        let i1 = (hash as usize) & self.mask;
271        let i2 = self.alt_index(i1, fingerprint);
272
273        if self.try_take(i1, fingerprint) || self.try_take(i2, fingerprint) {
274            self.len -= 1;
275            self.reinsert_victim();
276            return true;
277        }
278
279        if self.victim_matches(fingerprint, i1, i2) {
280            self.victim = None;
281            self.len -= 1;
282            return true;
283        }
284
285        false
286    }
287
288    /// Removes every item, retaining the allocation.
289    ///
290    /// # Examples
291    ///
292    /// ```
293    /// use bloom_lib::CuckooFilter;
294    ///
295    /// let mut filter = CuckooFilter::new(100).unwrap();
296    /// filter.insert("x").unwrap();
297    /// filter.clear();
298    /// assert!(filter.is_empty());
299    /// ```
300    pub fn clear(&mut self) {
301        self.buckets.iter_mut().for_each(|slot| *slot = EMPTY);
302        self.len = 0;
303        self.victim = None;
304    }
305
306    /// The number of fingerprints currently stored.
307    ///
308    /// # Examples
309    ///
310    /// ```
311    /// use bloom_lib::CuckooFilter;
312    ///
313    /// let mut filter = CuckooFilter::new(100).unwrap();
314    /// filter.insert("a").unwrap();
315    /// filter.insert("b").unwrap();
316    /// assert_eq!(filter.len(), 2);
317    /// ```
318    #[inline]
319    #[must_use]
320    pub fn len(&self) -> usize {
321        self.len
322    }
323
324    /// Returns `true` if the filter holds no items.
325    #[inline]
326    #[must_use]
327    pub fn is_empty(&self) -> bool {
328        self.len == 0
329    }
330
331    /// The maximum number of fingerprints the table can hold.
332    ///
333    /// This is `num_buckets * 4`. Practical fill tops out a little below this
334    /// value before insertions start to fail.
335    #[inline]
336    #[must_use]
337    pub fn capacity(&self) -> usize {
338        self.buckets.len()
339    }
340
341    /// The current load factor in `[0.0, 1.0]`.
342    ///
343    /// # Examples
344    ///
345    /// ```
346    /// use bloom_lib::CuckooFilter;
347    ///
348    /// let filter = CuckooFilter::<u32>::new(100).unwrap();
349    /// assert_eq!(filter.load_factor(), 0.0);
350    /// ```
351    #[must_use]
352    pub fn load_factor(&self) -> f64 {
353        self.len as f64 / self.buckets.len() as f64
354    }
355
356    /// Computes the alternate bucket index for a fingerprint via the XOR trick.
357    ///
358    /// Because XOR is its own inverse, `alt_index(alt_index(i, fp), fp) == i`,
359    /// which lets `contains` and relocation recover either candidate from the
360    /// other without storing it.
361    #[inline]
362    fn alt_index(&self, index: usize, fingerprint: u16) -> usize {
363        index ^ ((mix64(u64::from(fingerprint)) as usize) & self.mask)
364    }
365
366    /// Attempts to place `fingerprint` in a free slot of bucket `index`.
367    #[inline]
368    fn try_put(&mut self, index: usize, fingerprint: u16) -> bool {
369        let base = index * BUCKET_SIZE;
370        for slot in &mut self.buckets[base..base + BUCKET_SIZE] {
371            if *slot == EMPTY {
372                *slot = fingerprint;
373                return true;
374            }
375        }
376        false
377    }
378
379    /// Attempts to remove one `fingerprint` from bucket `index`.
380    #[inline]
381    fn try_take(&mut self, index: usize, fingerprint: u16) -> bool {
382        let base = index * BUCKET_SIZE;
383        for slot in &mut self.buckets[base..base + BUCKET_SIZE] {
384            if *slot == fingerprint {
385                *slot = EMPTY;
386                return true;
387            }
388        }
389        false
390    }
391
392    /// Returns `true` if bucket `index` holds `fingerprint`.
393    #[inline]
394    fn bucket_contains(&self, index: usize, fingerprint: u16) -> bool {
395        let base = index * BUCKET_SIZE;
396        self.buckets[base..base + BUCKET_SIZE].contains(&fingerprint)
397    }
398
399    /// Returns `true` if the held-out victim matches the query.
400    #[inline]
401    fn victim_matches(&self, fingerprint: u16, i1: usize, i2: usize) -> bool {
402        matches!(
403            self.victim,
404            Some(Victim { fingerprint: vf, index: vi })
405                if vf == fingerprint && (vi == i1 || vi == i2)
406        )
407    }
408
409    /// After a deletion frees a slot, try to settle the victim back into the
410    /// table so it stops blocking future insertions.
411    fn reinsert_victim(&mut self) {
412        let Some(victim) = self.victim else {
413            return;
414        };
415        let alt = self.alt_index(victim.index, victim.fingerprint);
416        if self.try_put(victim.index, victim.fingerprint) || self.try_put(alt, victim.fingerprint) {
417            self.victim = None;
418        }
419    }
420
421    /// Advances the internal xorshift PRNG used to pick relocation targets.
422    #[inline]
423    fn next_rng(&mut self) -> u64 {
424        let mut x = self.rng_state;
425        x ^= x << 13;
426        x ^= x >> 7;
427        x ^= x << 17;
428        self.rng_state = x;
429        x
430    }
431}
432
433/// Derives a non-zero 16-bit fingerprint from the high bits of a 64-bit hash.
434#[inline]
435fn fingerprint_of(hash: u64) -> u16 {
436    let fingerprint = (hash >> 48) as u16;
437    if fingerprint == EMPTY {
438        1
439    } else {
440        fingerprint
441    }
442}
443
444/// Rounds a capacity up to a power-of-two bucket count with ~5% headroom.
445fn capacity_to_buckets(capacity: usize) -> usize {
446    let target_load = 0.95;
447    let raw = libm::ceil(capacity as f64 / (BUCKET_SIZE as f64 * target_load)) as usize;
448    raw.max(1).next_power_of_two()
449}
450
451#[cfg(test)]
452mod tests {
453    #![allow(clippy::unwrap_used)]
454
455    use super::*;
456
457    #[test]
458    fn test_new_rejects_zero_capacity() {
459        assert!(matches!(
460            CuckooFilter::<&str>::new(0),
461            Err(Error::InvalidParameter { .. })
462        ));
463    }
464
465    #[test]
466    fn test_insert_contains_remove() {
467        let mut filter = CuckooFilter::new(1_000).unwrap();
468        filter.insert("alice").unwrap();
469        assert!(filter.contains("alice"));
470        assert!(filter.remove("alice"));
471        assert!(!filter.contains("alice"));
472        assert!(!filter.remove("alice"));
473    }
474
475    #[test]
476    fn test_no_false_negatives() {
477        let mut filter = CuckooFilter::new(5_000).unwrap();
478        for i in 0..2_000u32 {
479            filter.insert(&i).unwrap();
480        }
481        for i in 0..2_000u32 {
482            assert!(filter.contains(&i), "inserted item {i} reported absent");
483        }
484    }
485
486    #[test]
487    fn test_duplicates_need_matching_removes() {
488        let mut filter = CuckooFilter::new(100).unwrap();
489        filter.insert("dup").unwrap();
490        filter.insert("dup").unwrap();
491        assert_eq!(filter.len(), 2);
492        assert!(filter.remove("dup"));
493        assert!(filter.contains("dup")); // one copy remains
494        assert!(filter.remove("dup"));
495        assert!(!filter.contains("dup"));
496    }
497
498    #[test]
499    fn test_len_and_clear() {
500        let mut filter = CuckooFilter::new(100).unwrap();
501        assert!(filter.is_empty());
502        for i in 0..10u32 {
503            filter.insert(&i).unwrap();
504        }
505        assert_eq!(filter.len(), 10);
506        filter.clear();
507        assert!(filter.is_empty());
508        assert_eq!(filter.len(), 0);
509    }
510
511    #[test]
512    fn test_capacity_has_headroom() {
513        let filter = CuckooFilter::<u32>::new(1_000).unwrap();
514        assert!(filter.capacity() >= 1_000);
515        // Power-of-two bucket count means capacity is a multiple of 4.
516        assert_eq!(filter.capacity() % BUCKET_SIZE, 0);
517    }
518
519    #[test]
520    fn test_fill_to_capacity_eventually_errors() {
521        // A deliberately small filter; insert distinct items until it refuses.
522        let mut filter = CuckooFilter::<u64>::new(64).unwrap();
523        let mut inserted = 0u64;
524        let mut hit_limit = false;
525        for i in 0..10_000u64 {
526            match filter.insert(&i) {
527                Ok(()) => inserted += 1,
528                Err(Error::CapacityExceeded) => {
529                    hit_limit = true;
530                    break;
531                }
532                Err(other) => panic!("unexpected error: {other:?}"),
533            }
534        }
535        assert!(hit_limit, "filter never reported CapacityExceeded");
536        // Everything that was accepted must still be found (no false negatives).
537        for i in 0..inserted {
538            assert!(filter.contains(&i), "accepted item {i} reported absent");
539        }
540    }
541
542    #[test]
543    fn test_load_factor() {
544        let mut filter = CuckooFilter::<u32>::new(100).unwrap();
545        let cap = filter.capacity();
546        for i in 0..(cap / 2) as u32 {
547            filter.insert(&i).unwrap();
548        }
549        let lf = filter.load_factor();
550        assert!((0.49..=0.51).contains(&lf), "unexpected load factor {lf}");
551    }
552}