bloom_lib/cuckoo.rs
1//! A Cuckoo filter: approximate set membership with deletion support.
2
3use core::{hash::BuildHasher, marker::PhantomData};
4
5use alloc::{vec, vec::Vec};
6
7use crate::{
8 hash::{mix64, DefaultHashBuilder},
9 Error,
10};
11
12/// Fingerprints per bucket. Four is the value analysed by the original paper
13/// for a good balance of load factor and lookup cost.
14const BUCKET_SIZE: usize = 4;
15
16/// Maximum number of relocations attempted before declaring an insertion
17/// failure. The paper shows 500 keeps failure probability negligible below the
18/// load threshold.
19const MAX_KICKS: usize = 500;
20
21/// Sentinel fingerprint value meaning "empty slot". Real fingerprints are
22/// remapped away from zero, so this never collides with stored data.
23const EMPTY: u16 = 0;
24
25/// The held-out element produced when a relocation chain exhausts its budget.
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
27#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
28struct Victim {
29 fingerprint: u16,
30 index: usize,
31}
32
33/// A probabilistic set that supports deletion.
34///
35/// Like a [`BloomFilter`](crate::BloomFilter), a Cuckoo filter answers
36/// approximate membership queries in a fraction of the memory an exact set would
37/// need, and never reports a false negative for an item that is present.
38/// Unlike a Bloom filter it supports [`remove`](Self::remove): items can be
39/// deleted without rebuilding the structure.
40///
41/// The filter stores a short *fingerprint* of each item (16 bits here) in one of
42/// two candidate buckets, relocating existing fingerprints as needed to make
43/// room — the "cuckoo" behaviour. The false-positive rate is fixed by the
44/// fingerprint width at roughly 0.012%; for a tunable rate without deletion, use
45/// a [`BloomFilter`](crate::BloomFilter).
46///
47/// The filter is generic over the item type `T` and a
48/// [`BuildHasher`](core::hash::BuildHasher) `S`, defaulting to the deterministic
49/// [`DefaultHashBuilder`](crate::hash::DefaultHashBuilder).
50///
51/// # Examples
52///
53/// ```
54/// use bloom_lib::CuckooFilter;
55///
56/// let mut filter = CuckooFilter::new(1_000).unwrap();
57///
58/// filter.insert("alice").unwrap();
59/// assert!(filter.contains("alice"));
60///
61/// assert!(filter.remove("alice"));
62/// assert!(!filter.contains("alice"));
63/// ```
64#[derive(Debug, Clone)]
65#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
66pub struct CuckooFilter<T: ?Sized, S = DefaultHashBuilder> {
67 buckets: Vec<u16>,
68 mask: usize,
69 len: usize,
70 victim: Option<Victim>,
71 rng_state: u64,
72 #[cfg_attr(feature = "serde", serde(skip))]
73 hasher: S,
74 #[cfg_attr(feature = "serde", serde(skip))]
75 _marker: PhantomData<fn(&T)>,
76}
77
78impl<T: ?Sized> CuckooFilter<T, DefaultHashBuilder> {
79 /// Creates a filter able to hold roughly `capacity` items, using the default
80 /// hasher.
81 ///
82 /// The number of buckets is rounded up to a power of two with headroom, so
83 /// the reported [`capacity`](Self::capacity) is at least `capacity` and
84 /// usually larger. Inserting close to the reported capacity may begin to
85 /// fail with [`Error::CapacityExceeded`]; size with margin for write-heavy
86 /// workloads.
87 ///
88 /// # Parameters
89 ///
90 /// - `capacity`: the expected number of distinct items. Must be non-zero.
91 ///
92 /// # Errors
93 ///
94 /// Returns [`Error::InvalidParameter`] if `capacity` is zero.
95 ///
96 /// # Examples
97 ///
98 /// ```
99 /// use bloom_lib::CuckooFilter;
100 ///
101 /// let filter = CuckooFilter::<&str>::new(10_000).unwrap();
102 /// assert!(filter.capacity() >= 10_000);
103 /// assert!(filter.is_empty());
104 /// ```
105 pub fn new(capacity: usize) -> Result<Self, Error> {
106 Self::with_hasher(capacity, DefaultHashBuilder)
107 }
108}
109
110impl<T: ?Sized, S: BuildHasher> CuckooFilter<T, S> {
111 /// Creates a filter for roughly `capacity` items with a caller-supplied
112 /// hasher.
113 ///
114 /// # Errors
115 ///
116 /// Returns [`Error::InvalidParameter`] if `capacity` is zero.
117 ///
118 /// # Examples
119 ///
120 /// ```
121 /// # #[cfg(feature = "std")] {
122 /// use std::collections::hash_map::RandomState;
123 /// use bloom_lib::CuckooFilter;
124 ///
125 /// let filter: CuckooFilter<&str, RandomState> =
126 /// CuckooFilter::with_hasher(1_000, RandomState::new()).unwrap();
127 /// # }
128 /// ```
129 pub fn with_hasher(capacity: usize, hasher: S) -> Result<Self, Error> {
130 if capacity == 0 {
131 return Err(Error::InvalidParameter {
132 param: "capacity",
133 reason: "must be greater than zero",
134 });
135 }
136
137 let num_buckets = capacity_to_buckets(capacity);
138 Ok(Self {
139 buckets: vec![EMPTY; num_buckets * BUCKET_SIZE],
140 mask: num_buckets - 1,
141 len: 0,
142 victim: None,
143 // Non-zero xorshift seed derived from the table size.
144 rng_state: 0x9E37_79B9_7F4A_7C15 ^ num_buckets as u64,
145 hasher,
146 _marker: PhantomData,
147 })
148 }
149
150 /// Inserts `item` into the filter.
151 ///
152 /// Duplicates are permitted: inserting the same item twice stores two
153 /// fingerprints and requires two [`remove`](Self::remove) calls to fully
154 /// evict. This matches the semantics of the original Cuckoo filter and keeps
155 /// `remove` exact with respect to insertions.
156 ///
157 /// # Errors
158 ///
159 /// Returns [`Error::CapacityExceeded`] when the filter is too full to place
160 /// the item within its relocation budget. The filter is unchanged in that
161 /// case and remains usable for lookups and deletions.
162 ///
163 /// # Examples
164 ///
165 /// ```
166 /// use bloom_lib::CuckooFilter;
167 ///
168 /// let mut filter = CuckooFilter::new(100).unwrap();
169 /// filter.insert(&42u32).unwrap();
170 /// assert!(filter.contains(&42u32));
171 /// ```
172 pub fn insert(&mut self, item: &T) -> Result<(), Error>
173 where
174 T: core::hash::Hash,
175 {
176 // A held-out victim means the table is already at its practical limit.
177 if self.victim.is_some() {
178 return Err(Error::CapacityExceeded);
179 }
180
181 let hash = self.hasher.hash_one(item);
182 let fingerprint = fingerprint_of(hash);
183 let i1 = (hash as usize) & self.mask;
184 let i2 = self.alt_index(i1, fingerprint);
185
186 if self.try_put(i1, fingerprint) || self.try_put(i2, fingerprint) {
187 self.len += 1;
188 return Ok(());
189 }
190
191 // Both candidate buckets are full: relocate a resident fingerprint.
192 let mut index = if self.next_rng() & 1 == 0 { i1 } else { i2 };
193 let mut moving = fingerprint;
194 for _ in 0..MAX_KICKS {
195 let slot = (self.next_rng() as usize) & (BUCKET_SIZE - 1);
196 let pos = index * BUCKET_SIZE + slot;
197 core::mem::swap(&mut moving, &mut self.buckets[pos]);
198 index = self.alt_index(index, moving);
199 if self.try_put(index, moving) {
200 self.len += 1;
201 return Ok(());
202 }
203 }
204
205 // Budget exhausted. `moving` is now homeless; stash it in the victim
206 // slot so no previously-inserted element is lost, and count the new
207 // element we successfully placed during the first swap.
208 self.victim = Some(Victim {
209 fingerprint: moving,
210 index,
211 });
212 self.len += 1;
213 Ok(())
214 }
215
216 /// Tests whether `item` is in the filter.
217 ///
218 /// Returns `false` only if the item was definitely never inserted (or has
219 /// since been removed). A return of `true` means the item is *probably*
220 /// present, with a false-positive probability of roughly 0.012%.
221 ///
222 /// # Examples
223 ///
224 /// ```
225 /// use bloom_lib::CuckooFilter;
226 ///
227 /// let mut filter = CuckooFilter::new(100).unwrap();
228 /// filter.insert("present").unwrap();
229 /// assert!(filter.contains("present"));
230 /// assert!(!filter.contains("absent"));
231 /// ```
232 #[must_use]
233 pub fn contains(&self, item: &T) -> bool
234 where
235 T: core::hash::Hash,
236 {
237 let hash = self.hasher.hash_one(item);
238 let fingerprint = fingerprint_of(hash);
239 let i1 = (hash as usize) & self.mask;
240 let i2 = self.alt_index(i1, fingerprint);
241 self.bucket_contains(i1, fingerprint)
242 || self.bucket_contains(i2, fingerprint)
243 || self.victim_matches(fingerprint, i1, i2)
244 }
245
246 /// Removes one occurrence of `item`, returning `true` if it was present.
247 ///
248 /// Only a single stored fingerprint is removed per call, mirroring `insert`.
249 /// Because fingerprints collide, removing an item that was never inserted
250 /// but collides with a present one can delete the wrong fingerprint; this is
251 /// the same false-positive trade-off that governs `contains`. Only delete
252 /// items you previously inserted.
253 ///
254 /// # Examples
255 ///
256 /// ```
257 /// use bloom_lib::CuckooFilter;
258 ///
259 /// let mut filter = CuckooFilter::new(100).unwrap();
260 /// filter.insert("temp").unwrap();
261 /// assert!(filter.remove("temp"));
262 /// assert!(!filter.remove("temp")); // already gone
263 /// ```
264 pub fn remove(&mut self, item: &T) -> bool
265 where
266 T: core::hash::Hash,
267 {
268 let hash = self.hasher.hash_one(item);
269 let fingerprint = fingerprint_of(hash);
270 let i1 = (hash as usize) & self.mask;
271 let i2 = self.alt_index(i1, fingerprint);
272
273 if self.try_take(i1, fingerprint) || self.try_take(i2, fingerprint) {
274 self.len -= 1;
275 self.reinsert_victim();
276 return true;
277 }
278
279 if self.victim_matches(fingerprint, i1, i2) {
280 self.victim = None;
281 self.len -= 1;
282 return true;
283 }
284
285 false
286 }
287
288 /// Removes every item, retaining the allocation.
289 ///
290 /// # Examples
291 ///
292 /// ```
293 /// use bloom_lib::CuckooFilter;
294 ///
295 /// let mut filter = CuckooFilter::new(100).unwrap();
296 /// filter.insert("x").unwrap();
297 /// filter.clear();
298 /// assert!(filter.is_empty());
299 /// ```
300 pub fn clear(&mut self) {
301 self.buckets.iter_mut().for_each(|slot| *slot = EMPTY);
302 self.len = 0;
303 self.victim = None;
304 }
305
306 /// The number of fingerprints currently stored.
307 ///
308 /// # Examples
309 ///
310 /// ```
311 /// use bloom_lib::CuckooFilter;
312 ///
313 /// let mut filter = CuckooFilter::new(100).unwrap();
314 /// filter.insert("a").unwrap();
315 /// filter.insert("b").unwrap();
316 /// assert_eq!(filter.len(), 2);
317 /// ```
318 #[inline]
319 #[must_use]
320 pub fn len(&self) -> usize {
321 self.len
322 }
323
324 /// Returns `true` if the filter holds no items.
325 #[inline]
326 #[must_use]
327 pub fn is_empty(&self) -> bool {
328 self.len == 0
329 }
330
331 /// The maximum number of fingerprints the table can hold.
332 ///
333 /// This is `num_buckets * 4`. Practical fill tops out a little below this
334 /// value before insertions start to fail.
335 #[inline]
336 #[must_use]
337 pub fn capacity(&self) -> usize {
338 self.buckets.len()
339 }
340
341 /// The current load factor in `[0.0, 1.0]`.
342 ///
343 /// # Examples
344 ///
345 /// ```
346 /// use bloom_lib::CuckooFilter;
347 ///
348 /// let filter = CuckooFilter::<u32>::new(100).unwrap();
349 /// assert_eq!(filter.load_factor(), 0.0);
350 /// ```
351 #[must_use]
352 pub fn load_factor(&self) -> f64 {
353 self.len as f64 / self.buckets.len() as f64
354 }
355
356 /// Computes the alternate bucket index for a fingerprint via the XOR trick.
357 ///
358 /// Because XOR is its own inverse, `alt_index(alt_index(i, fp), fp) == i`,
359 /// which lets `contains` and relocation recover either candidate from the
360 /// other without storing it.
361 #[inline]
362 fn alt_index(&self, index: usize, fingerprint: u16) -> usize {
363 index ^ ((mix64(u64::from(fingerprint)) as usize) & self.mask)
364 }
365
366 /// Attempts to place `fingerprint` in a free slot of bucket `index`.
367 #[inline]
368 fn try_put(&mut self, index: usize, fingerprint: u16) -> bool {
369 let base = index * BUCKET_SIZE;
370 for slot in &mut self.buckets[base..base + BUCKET_SIZE] {
371 if *slot == EMPTY {
372 *slot = fingerprint;
373 return true;
374 }
375 }
376 false
377 }
378
379 /// Attempts to remove one `fingerprint` from bucket `index`.
380 #[inline]
381 fn try_take(&mut self, index: usize, fingerprint: u16) -> bool {
382 let base = index * BUCKET_SIZE;
383 for slot in &mut self.buckets[base..base + BUCKET_SIZE] {
384 if *slot == fingerprint {
385 *slot = EMPTY;
386 return true;
387 }
388 }
389 false
390 }
391
392 /// Returns `true` if bucket `index` holds `fingerprint`.
393 #[inline]
394 fn bucket_contains(&self, index: usize, fingerprint: u16) -> bool {
395 let base = index * BUCKET_SIZE;
396 self.buckets[base..base + BUCKET_SIZE].contains(&fingerprint)
397 }
398
399 /// Returns `true` if the held-out victim matches the query.
400 #[inline]
401 fn victim_matches(&self, fingerprint: u16, i1: usize, i2: usize) -> bool {
402 matches!(
403 self.victim,
404 Some(Victim { fingerprint: vf, index: vi })
405 if vf == fingerprint && (vi == i1 || vi == i2)
406 )
407 }
408
409 /// After a deletion frees a slot, try to settle the victim back into the
410 /// table so it stops blocking future insertions.
411 fn reinsert_victim(&mut self) {
412 let Some(victim) = self.victim else {
413 return;
414 };
415 let alt = self.alt_index(victim.index, victim.fingerprint);
416 if self.try_put(victim.index, victim.fingerprint) || self.try_put(alt, victim.fingerprint) {
417 self.victim = None;
418 }
419 }
420
421 /// Advances the internal xorshift PRNG used to pick relocation targets.
422 #[inline]
423 fn next_rng(&mut self) -> u64 {
424 let mut x = self.rng_state;
425 x ^= x << 13;
426 x ^= x >> 7;
427 x ^= x << 17;
428 self.rng_state = x;
429 x
430 }
431}
432
433/// Derives a non-zero 16-bit fingerprint from the high bits of a 64-bit hash.
434#[inline]
435fn fingerprint_of(hash: u64) -> u16 {
436 let fingerprint = (hash >> 48) as u16;
437 if fingerprint == EMPTY {
438 1
439 } else {
440 fingerprint
441 }
442}
443
444/// Rounds a capacity up to a power-of-two bucket count with ~5% headroom.
445fn capacity_to_buckets(capacity: usize) -> usize {
446 let target_load = 0.95;
447 let raw = libm::ceil(capacity as f64 / (BUCKET_SIZE as f64 * target_load)) as usize;
448 raw.max(1).next_power_of_two()
449}
450
451#[cfg(test)]
452mod tests {
453 #![allow(clippy::unwrap_used)]
454
455 use super::*;
456
457 #[test]
458 fn test_new_rejects_zero_capacity() {
459 assert!(matches!(
460 CuckooFilter::<&str>::new(0),
461 Err(Error::InvalidParameter { .. })
462 ));
463 }
464
465 #[test]
466 fn test_insert_contains_remove() {
467 let mut filter = CuckooFilter::new(1_000).unwrap();
468 filter.insert("alice").unwrap();
469 assert!(filter.contains("alice"));
470 assert!(filter.remove("alice"));
471 assert!(!filter.contains("alice"));
472 assert!(!filter.remove("alice"));
473 }
474
475 #[test]
476 fn test_no_false_negatives() {
477 let mut filter = CuckooFilter::new(5_000).unwrap();
478 for i in 0..2_000u32 {
479 filter.insert(&i).unwrap();
480 }
481 for i in 0..2_000u32 {
482 assert!(filter.contains(&i), "inserted item {i} reported absent");
483 }
484 }
485
486 #[test]
487 fn test_duplicates_need_matching_removes() {
488 let mut filter = CuckooFilter::new(100).unwrap();
489 filter.insert("dup").unwrap();
490 filter.insert("dup").unwrap();
491 assert_eq!(filter.len(), 2);
492 assert!(filter.remove("dup"));
493 assert!(filter.contains("dup")); // one copy remains
494 assert!(filter.remove("dup"));
495 assert!(!filter.contains("dup"));
496 }
497
498 #[test]
499 fn test_len_and_clear() {
500 let mut filter = CuckooFilter::new(100).unwrap();
501 assert!(filter.is_empty());
502 for i in 0..10u32 {
503 filter.insert(&i).unwrap();
504 }
505 assert_eq!(filter.len(), 10);
506 filter.clear();
507 assert!(filter.is_empty());
508 assert_eq!(filter.len(), 0);
509 }
510
511 #[test]
512 fn test_capacity_has_headroom() {
513 let filter = CuckooFilter::<u32>::new(1_000).unwrap();
514 assert!(filter.capacity() >= 1_000);
515 // Power-of-two bucket count means capacity is a multiple of 4.
516 assert_eq!(filter.capacity() % BUCKET_SIZE, 0);
517 }
518
519 #[test]
520 fn test_fill_to_capacity_eventually_errors() {
521 // A deliberately small filter; insert distinct items until it refuses.
522 let mut filter = CuckooFilter::<u64>::new(64).unwrap();
523 let mut inserted = 0u64;
524 let mut hit_limit = false;
525 for i in 0..10_000u64 {
526 match filter.insert(&i) {
527 Ok(()) => inserted += 1,
528 Err(Error::CapacityExceeded) => {
529 hit_limit = true;
530 break;
531 }
532 Err(other) => panic!("unexpected error: {other:?}"),
533 }
534 }
535 assert!(hit_limit, "filter never reported CapacityExceeded");
536 // Everything that was accepted must still be found (no false negatives).
537 for i in 0..inserted {
538 assert!(filter.contains(&i), "accepted item {i} reported absent");
539 }
540 }
541
542 #[test]
543 fn test_load_factor() {
544 let mut filter = CuckooFilter::<u32>::new(100).unwrap();
545 let cap = filter.capacity();
546 for i in 0..(cap / 2) as u32 {
547 filter.insert(&i).unwrap();
548 }
549 let lf = filter.load_factor();
550 assert!((0.49..=0.51).contains(&lf), "unexpected load factor {lf}");
551 }
552}