// aleph_filter/aleph_filter.rs
1//!
2//! The Aleph Filter is a quotient filter variant that supports:
3//! - O(1) expected insertion and lookup
4//! - Tunable false positive rate (FPR)
5//! - Automatic expansion with fingerprint bit sacrifice
6//! - Deletion support (via tombstones for void entries)
7
8use std::collections::HashMap;
9use crate::hash::{hash_key, split_hash, fingerprint_bits_for_fpr};
10use crate::metadata::SlotMetadata;
11use crate::slot::Slot;
12
/// The main Aleph Filter data structure.
///
/// Stores fingerprints in a compact quotient-filter format with dynamic expansion.
/// Each element occupies one logical "slot" with data stored in `slots` and metadata in `metadata`.
#[derive(Clone)]
pub struct AlephFilter {
    /// Fingerprint storage. Length = num_slots + num_extension_slots.
    /// Stores packed fingerprints for compact memory usage; runs/clusters are
    /// described by the parallel `metadata` array.
    slots: Vec<Slot>,

    /// Per-slot metadata flags (occupied, continuation, shifted).
    /// Tracks the structure of runs and clusters; same length as `slots`.
    metadata: Vec<SlotMetadata>,

    /// Number of logical slots (always a power of 2).
    /// The "quotient" component of the quotient filter.
    num_slots: usize,

    /// Extra overflow slots at the end (linear, no wraparound).
    /// Prevents constant expansion and absorbs shifted entries near the top.
    num_extension_slots: usize,

    /// log2(num_slots) - used to extract the quotient from a hash.
    quotient_bits: u32,

    /// Fingerprint bits at creation time.
    /// The effective width decreases by 1 with each `expand()` (see `fp_bits()`)
    /// to make space for the extra quotient bit.
    base_fp_bits: u32,

    /// Number of times the filter has expanded.
    /// Each expansion doubles slots and sacrifices 1 fingerprint bit per entry.
    num_expansions: usize,

    /// Current number of distinct items inserted.
    num_items: usize,

    /// Reserved for future use (e.g., storing full hashes for deletions).
    /// NOTE(review): within this file it is only ever created and cleared
    /// (`expand()`), never written to — confirm intended use before relying on it.
    mother_hashes: HashMap<usize, u64>,

    /// Load factor threshold. When `num_items / num_slots >= max_load_factor`, expand.
    max_load_factor: f64,
}
55
56// PUBLIC API
57
58impl AlephFilter {
59 /// Creates a new Aleph Filter sized for `expected_items` with target `fpr`.
60 ///
61 /// Automatically computes internal parameters (slots, fingerprint width) to meet the FPR.
62 ///
63 /// # Arguments
64 ///
65 /// * `expected_items` - Expected number of items to insert (must be > 0)
66 /// * `fpr` - Target false positive rate, e.g., 0.01 for 1% (must be in (0, 1))
67 ///
68 /// # Returns
69 ///
70 /// A new empty AlephFilter instance
71 ///
72 /// # Panics
73 ///
74 /// If `expected_items <= 0` or `fpr` not in (0, 1).
75 pub fn new(expected_items: usize, fpr: f64) -> Self {
76 assert!(expected_items > 0);
77 assert!(fpr > 0.0 && fpr < 1.0);
78 let raw = (expected_items as f64 / 0.8).ceil() as usize;
79 let num_slots = raw.next_power_of_two().max(8);
80 let q = num_slots.trailing_zeros();
81 let fp = fingerprint_bits_for_fpr(fpr);
82 let ext = (q as usize) * 2; // Match Java: power_of_two_size * 2
83 return Self {
84 slots: vec![Slot::empty(); num_slots + ext],
85 metadata: vec![SlotMetadata::new(); num_slots + ext],
86 num_slots, num_extension_slots: ext,
87 quotient_bits: q, base_fp_bits: fp,
88 num_expansions: 0, num_items: 0,
89 mother_hashes: HashMap::new(),
90 max_load_factor: 0.8, // Match Java: fullness_threshold = 0.8
91 };
92 }
93
94 /// Creates a new Aleph Filter with manual parameters.
95 ///
96 /// Use this constructor when you already know the desired slot count and fingerprint width.
97 ///
98 /// # Arguments
99 ///
100 /// * `num_slots` - Desired number of logical slots (rounded up to next power of 2)
101 /// * `remainder_bits` - Fingerprint width in bits
102 ///
103 /// # Returns
104 ///
105 /// A new empty AlephFilter instance
106 pub fn with_params(num_slots: usize, remainder_bits: u32) -> Self {
107 let num_slots = num_slots.next_power_of_two().max(8);
108 let q = num_slots.trailing_zeros();
109 let ext = (q as usize) * 2; // Match Java: power_of_two_size * 2
110 return Self {
111 slots: vec![Slot::empty(); num_slots + ext],
112 metadata: vec![SlotMetadata::new(); num_slots + ext],
113 num_slots, num_extension_slots: ext,
114 quotient_bits: q, base_fp_bits: remainder_bits,
115 num_expansions: 0, num_items: 0,
116 mother_hashes: HashMap::new(),
117 max_load_factor: 0.8, // Match Java: fullness_threshold = 0.8
118 };
119 }
120
121 /// Inserts a key into the filter.
122 ///
123 /// May trigger an automatic expansion if load factor exceeds threshold.
124 /// Supports duplicate insertions (see contains for lookup semantics).
125 ///
126 /// # Arguments
127 ///
128 /// * `key` - Byte slice to insert
129 ///
130 /// # Complexity
131 ///
132 /// O(1) amortized expected time
133 pub fn insert(&mut self, key: &[u8]) {
134 if self.load_factor() >= self.max_load_factor {
135 self.expand();
136 }
137 let hash = hash_key(key);
138 let r = self.fp_bits();
139 let (slot_idx, fp) = split_hash(hash, self.quotient_bits, r);
140 let canonical = slot_idx as usize;
141 let slot = if r == 0 {
142 Slot::void_marker() // No fingerprint bits left -> void
143 } else {
144 Slot::new(fp, r as u8)
145 };
146 self.qf_insert_slot(canonical, slot);
147 self.num_items += 1;
148 return;
149 }
150
151 /// Checks if a key might be in the filter.
152 ///
153 /// Returns `true` if the key is definitely in the filter.
154 /// May return `true` for keys not inserted (false positive at rate `fpr`).
155 /// Never returns `true` for keys definitely not inserted (no false negatives).
156 ///
157 /// # Arguments
158 ///
159 /// * `key` - Byte slice to query
160 ///
161 /// # Returns
162 ///
163 /// `true` if key is in filter (or hash collision), `false` if definitely not in filter
164 ///
165 /// # Complexity
166 ///
167 /// O(1) expected time
168 pub fn contains(&self, key: &[u8]) -> bool {
169 let hash = hash_key(key);
170 let r = self.fp_bits();
171 let (slot_idx, fp) = split_hash(hash, self.quotient_bits, r);
172 let canonical = slot_idx as usize;
173 return self.qf_search(fp, canonical);
174 }
175
176 /// Removes a key from the filter, if present.
177 ///
178 /// Deleting void entries (exhausted fingerprints) marks them with a tombstone.
179 /// Deleting regular entries compacts the run by shifting elements left.
180 ///
181 /// # Arguments
182 ///
183 /// * `key` - Byte slice to delete
184 ///
185 /// # Returns
186 ///
187 /// `true` if a matching entry was found and deleted, `false` if not found
188 ///
189 /// # Complexity
190 ///
191 /// O(1) expected time
192 pub fn delete(&mut self, key: &[u8]) -> bool {
193 let hash = hash_key(key);
194 let r = self.fp_bits();
195 let (slot_idx, fp) = split_hash(hash, self.quotient_bits, r);
196 let canonical = slot_idx as usize;
197 if self.qf_delete(fp, canonical) {
198 self.num_items = self.num_items.saturating_sub(1);
199 return true;
200 } else {
201 return false;
202 }
203 }
204
205 /// Returns the number of items inserted, accounting for duplicates.
206 ///
207 /// # Returns
208 ///
209 /// Count of insertion operations
210 #[inline] pub fn len(&self) -> usize { return self.num_items; }
211
212 /// Returns `true` if the filter contains no items.
213 ///
214 /// # Returns
215 ///
216 /// `true` if empty, `false` otherwise
217 #[inline] pub fn is_empty(&self) -> bool { return self.num_items == 0; }
218
219 /// Returns the current load factor (items / slots).
220 ///
221 /// # Returns
222 ///
223 /// Ratio in [0, 1+) (can exceed 1 due to extension slots)
224 #[inline] pub fn load_factor(&self) -> f64 { return self.num_items as f64 / self.num_slots as f64; }
225
226 /// Returns the current logical slot count.
227 ///
228 /// # Returns
229 ///
230 /// Number of quotient-addressable slots
231 #[inline] pub fn capacity(&self) -> usize { return self.num_slots; }
232
233 /// Returns the number of times the filter has expanded.
234 ///
235 /// Each expansion doubles slots and sacrifices 1 fingerprint bit per entry.
236 ///
237 /// # Returns
238 ///
239 /// Expansion count
240 #[inline] pub fn num_expansions(&self) -> usize { return self.num_expansions; }
241
242 /// Returns the current number of quotient bits (log2 of logical slots).
243 ///
244 /// # Returns
245 ///
246 /// Quotient bits
247 #[inline] pub fn quotient_bits(&self) -> u32 { return self.quotient_bits; }
248}
249
250// INTERNAL HELPERS
251
252impl AlephFilter {
253 /// Returns total addressable slots (logical + extension).
254 ///
255 /// Used internally to guard against out-of-bounds access.
256 ///
257 /// # Returns
258 ///
259 /// Total array size
260 #[inline]
261 fn total_slots(&self) -> usize { return self.num_slots + self.num_extension_slots; }
262
263 /// Returns the current fingerprint width in bits.
264 ///
265 /// Decreases by 1 with each expansion to make room for additional quotient bits.
266 /// Uses saturating subtraction to prevent underflow.
267 ///
268 /// # Returns
269 ///
270 /// Fingerprint bits available (>= 0)
271 #[inline]
272 pub fn fp_bits(&self) -> u32 { return self.base_fp_bits.saturating_sub(self.num_expansions as u32); }
273
274 /// Checks if a slot is completely empty (no data and no metadata flags set).
275 ///
276 /// A slot is empty only when all three metadata flags (occupied, continuation, shifted)
277 /// are off AND the slot data is zero.
278 ///
279 /// # Arguments
280 ///
281 /// * `i` - Slot index
282 ///
283 /// # Returns
284 ///
285 /// `true` if slot is fully empty
286 #[inline]
287 fn is_slot_empty(&self, i: usize) -> bool {
288 return !self.metadata[i].is_occupied()
289 && !self.metadata[i].is_continuation()
290 && !self.metadata[i].is_shifted();
291 }
292
293 /// Atomically swaps a slot, returning the old value.
294 ///
295 /// Used by insertion logic to push elements right.
296 ///
297 /// # Arguments
298 ///
299 /// * `i` - Slot index
300 /// * `new_slot` - Value to place in slot
301 ///
302 /// # Returns
303 ///
304 /// Previous value at slot `i`
305 fn swap_slot(&mut self, i: usize, new_slot: Slot) -> Slot {
306 let old = self.slots[i];
307 self.slots[i] = new_slot;
308 return old;
309 }
310}
311
312// QUOTIENT FILTER CORE
313
impl AlephFilter {
    /// Finds the start position of the run for a given canonical slot.
    ///
    /// The algorithm:
    /// 1. Walk backward from canonical, counting occupied slots while the shifted flag is set.
    /// 2. Walk forward from that position, counting run starts (non-continuation slots).
    /// 3. Return the position of the Nth run start where N = count from phase 1.
    ///
    /// This determines which run a canonical slot belongs to, even if shifted.
    ///
    /// NOTE(review): phase 1 also counts the occupied flag of `canonical` itself
    /// when `canonical` is shifted — verify this matches the reference
    /// implementation's counting convention.
    ///
    /// # Arguments
    ///
    /// * `canonical` - Quotient (canonical slot index)
    ///
    /// # Returns
    ///
    /// Index of the run's first slot (falls back to `canonical` if the forward
    /// scan runs off the end of the array)
    ///
    fn find_run_start(&self, canonical: usize) -> usize {
        let mut pos = canonical;
        // skip = number of run starts still to pass in phase 2; starts at 1 so
        // an unshifted canonical slot immediately returns itself.
        let mut skip: i64 = 1;

        // Phase 1: walk backward counting occupied slots
        while self.metadata[pos].is_shifted() {
            if self.metadata[pos].is_occupied() {
                skip += 1;
            }
            if pos == 0 { break; }
            pos -= 1;
        }

        // Phase 2: walk forward counting run starts (non-continuation)
        loop {
            if !self.metadata[pos].is_continuation() {
                skip -= 1;
                if skip == 0 {
                    return pos;
                }
            }
            pos += 1;
            // Defensive fallback: should not be reached on a consistent filter.
            if pos >= self.total_slots() { return canonical; }
        }
    }

    /// Finds a suitable starting position for a new run.
    ///
    /// Starting from a run start, skips past that slot (if non-empty) and any
    /// continuation slots that follow. Returns the first position where a new
    /// run can begin.
    ///
    /// # Arguments
    ///
    /// * `index` - Starting position (typically a run start)
    ///
    /// # Returns
    ///
    /// Position suitable for inserting a new run (may equal `total_slots()` if
    /// the scan runs off the end — callers must bounds-check)
    ///
    fn find_new_run_location(&self, index: usize) -> usize {
        let mut pos = index;
        if pos < self.total_slots() && !self.is_slot_empty(pos) {
            pos += 1;
        }
        while pos < self.total_slots() && self.metadata[pos].is_continuation() {
            pos += 1;
        }
        return pos;
    }


    /// Inserts a slot at its canonical position, dispatching to appropriate insertion logic.
    ///
    /// If no run exists at canonical (occupied flag clear), creates a new run.
    /// If a run exists, inserts into the existing run.
    ///
    /// # Arguments
    ///
    /// * `canonical` - Quotient (canonical slot index)
    /// * `slot` - Slot data to insert (fingerprint + length)
    ///
    /// # Returns
    ///
    /// `true` on success, `false` if array is full
    fn qf_insert_slot(&mut self, canonical: usize, slot: Slot) -> bool {
        let does_run_exist = self.metadata[canonical].is_occupied();
        if !does_run_exist {
            return self.insert_new_run(canonical, slot);
        } else {
            let run_start = self.find_run_start(canonical);
            return self.insert_into_run(slot, run_start);
        }
    }

    /// Creates a new run for a canonical slot and inserts the slot.
    ///
    /// Sets the occupied flag, updates shifted/continuation metadata, and performs
    /// a push-right cascade if the insertion position is already in use.
    ///
    /// NOTE(review): on the `false` (array full) failure path, the occupied and
    /// shifted flags set above have already been applied and the displaced slot
    /// chain is dropped — the filter is left in a degraded state. Callers treat
    /// this as best-effort.
    ///
    /// # Arguments
    ///
    /// * `canonical` - Quotient (canonical slot index)
    /// * `slot` - Slot data to insert
    ///
    /// # Returns
    ///
    /// `true` on success, `false` if array is full
    ///
    fn insert_new_run(&mut self, canonical: usize, slot: Slot) -> bool {
        let run_start = self.find_run_start(canonical);
        let insert_pos = self.find_new_run_location(run_start);

        let slot_empty = self.is_slot_empty(insert_pos);

        self.metadata[canonical].set_occupied(true);
        // A run stored away from its canonical slot is, by definition, shifted.
        if insert_pos != canonical {
            self.metadata[insert_pos].set_shifted(true);
        }
        // The new run's first slot is never a continuation.
        self.metadata[insert_pos].set_continuation(false);

        if slot_empty {
            self.slots[insert_pos] = slot;
            return true;
        }

        // Push everything right via swap chain
        let mut current_slot = slot;
        let mut pos = insert_pos;
        // Carries the displaced slot's continuation flag one position right.
        let mut temp_cont = false;
        loop {
            if pos >= self.total_slots() { return false; }
            let was_empty = self.is_slot_empty(pos);
            current_slot = self.swap_slot(pos, current_slot);

            if pos > insert_pos {
                self.metadata[pos].set_shifted(true);
            }
            if pos > insert_pos {
                let cur_cont = self.metadata[pos].is_continuation();
                self.metadata[pos].set_continuation(temp_cont);
                temp_cont = cur_cont;
            }
            pos += 1;
            if was_empty { break; }
        }
        return true;
    }

    /// Inserts a slot into an existing run, preserving run structure.
    ///
    /// Scans forward through the run; once past the run's existing entries it
    /// starts the push-right swap chain, carrying continuation flags along so
    /// downstream runs keep their boundaries.
    ///
    /// NOTE(review): like `insert_new_run`, the `false` failure path abandons
    /// the displaced slot mid-cascade.
    ///
    /// # Arguments
    ///
    /// * `slot` - Slot data to insert
    /// * `run_start` - Index of run's first slot
    ///
    /// # Returns
    ///
    /// `true` on success, `false` if array is full
    ///
    fn insert_into_run(&mut self, slot: Slot, run_start: usize) -> bool {
        let mut current_slot = slot;
        let mut pos = run_start;
        // Becomes true once we have passed the end of the target run and started
        // displacing slots.
        let mut finished_first_run = false;
        // Carries displaced continuation flags one position right.
        let mut temp_cont = false;

        loop {
            if pos >= self.total_slots() { return false; }
            let was_empty = self.is_slot_empty(pos);

            if pos > run_start {
                self.metadata[pos].set_shifted(true);
            }

            if pos > run_start && !finished_first_run && !self.metadata[pos].is_continuation() {
                // First slot past the run: the new entry lands here and becomes
                // a continuation of the run; the old occupant starts cascading.
                finished_first_run = true;
                self.metadata[pos].set_continuation(true);
                current_slot = self.swap_slot(pos, current_slot);
            } else if finished_first_run {
                let cur_cont = self.metadata[pos].is_continuation();
                self.metadata[pos].set_continuation(temp_cont);
                temp_cont = cur_cont;
                current_slot = self.swap_slot(pos, current_slot);
            }

            pos += 1;
            if was_empty { break; }
        }
        return true;
    }

    /// Searches for a fingerprint in the run for a canonical slot.
    ///
    /// Returns `false` early if the canonical slot is not occupied (no run maps
    /// there). Otherwise, locates the run and scans it for a match.
    ///
    /// # Arguments
    ///
    /// * `fp` - Fingerprint to search for
    /// * `canonical` - Quotient (canonical slot index)
    ///
    /// # Returns
    ///
    /// `true` if fingerprint is found in the run, `false` otherwise
    ///
    fn qf_search(&self, fp: u64, canonical: usize) -> bool {
        if !self.metadata[canonical].is_occupied() {
            return false;
        }
        let run_start = self.find_run_start(canonical);
        return self.find_in_run(run_start, fp);
    }

    /// Scans a run linearly for a matching fingerprint.
    ///
    /// Iterates from `start`, checking each slot against the target fingerprint
    /// at the current fingerprint width. Skips tombstones and stops at the end
    /// of the run (when the next slot's continuation flag is off).
    ///
    /// NOTE(review): `start` is indexed unchecked — callers must pass an index
    /// `< total_slots()` (true for `find_run_start` results on a consistent
    /// filter).
    ///
    /// # Arguments
    ///
    /// * `start` - Starting index of run
    /// * `fp` - Fingerprint to match
    ///
    /// # Returns
    ///
    /// `true` if a matching fingerprint is found, `false` otherwise
    ///
    fn find_in_run(&self, start: usize, fp: u64) -> bool {
        let mut pos = start;
        let r = self.fp_bits();
        loop {
            if self.slots[pos].is_tombstone() {
                // skip tombstones left behind by void deletions
            } else if self.slots[pos].matches(fp, r as u8) {
                return true;
            }
            pos += 1;
            if pos >= self.total_slots() || !self.metadata[pos].is_continuation() {
                break;
            }
        }
        false
    }

    /// Finds the start of the cluster containing the given index.
    ///
    /// Walks backward until finding a slot that is not shifted (a cluster always
    /// begins with an unshifted run start).
    /// Matches Java's `find_cluster_start`.
    fn find_cluster_start(&self, index: usize) -> usize {
        let mut pos = index;
        while pos > 0 && self.metadata[pos].is_shifted() {
            pos -= 1;
        }
        return pos;
    }

    /// Finds the end of the run starting at `index`.
    ///
    /// Walks forward while the *next* slot's continuation flag is set.
    /// Matches Java's `find_run_end`.
    fn find_run_end(&self, index: usize) -> usize {
        let mut pos = index;
        while pos + 1 < self.total_slots() && self.metadata[pos + 1].is_continuation() {
            pos += 1;
        }
        return pos;
    }

    /// Deletes a fingerprint from the run for a canonical slot.
    ///
    /// Uses the Java implementation's full cluster-aware deletion:
    /// 1. Find the matching fingerprint in the run (last match wins)
    /// 2. Shift entries within the run backward to fill the gap
    /// 3. For each subsequent shifted run in the cluster, shift it back by one slot
    /// 4. Clear metadata at the vacated position
    /// 5. Clear the occupied flag if the run is now empty
    ///
    /// This is more correct than run-local deletion because it properly
    /// compacts the entire cluster, maintaining metadata consistency.
    ///
    /// Void entries are never physically removed — they are replaced by a
    /// tombstone so cluster structure is undisturbed.
    fn qf_delete(&mut self, fp: u64, canonical: usize) -> bool {
        // Only logical (non-extension) slots are valid canonical positions.
        if canonical >= self.num_slots {
            return false;
        }
        if !self.metadata[canonical].is_occupied() {
            return false;
        }
        let run_start = self.find_run_start(canonical);
        let r = self.fp_bits();

        // Find the matching fingerprint (search for last match, like Java's decide_which_fingerprint_to_delete)
        let mut pos = run_start;
        let mut found: Option<usize> = None;
        loop {
            if !self.slots[pos].is_tombstone() && self.slots[pos].matches(fp, r as u8) {
                found = Some(pos);
            }
            pos += 1;
            if pos >= self.total_slots() || !self.metadata[pos].is_continuation() {
                break;
            }
        }

        let del_pos = match found {
            Some(p) => p,
            None => return false,
        };

        // Void entries get tombstoned (matching Java's behavior for void deletes)
        if self.slots[del_pos].is_void() {
            self.slots[del_pos] = Slot::tombstone();
            return true;
        }

        let mut run_end = self.find_run_end(del_pos);

        // Check if this run has only one entry (will need to clear occupied flag)
        let turn_off_occupied = run_start == run_end;

        // Shift entries within this run backward to fill the gap
        for i in del_pos..run_end {
            self.slots[i] = self.slots[i + 1];
        }

        // Count continuation and non-occupied flags from cluster start to run_end.
        // This tells us how far entries are shifted from their canonical positions
        // and later decides whether a following run snaps back to canonical.
        let cluster_start = self.find_cluster_start(canonical);
        let mut num_shifted_count: i64 = 0;
        let mut num_non_occupied: i64 = 0;
        for i in cluster_start..=run_end {
            if self.metadata[i].is_continuation() {
                num_shifted_count += 1;
            }
            if !self.metadata[i].is_occupied() {
                num_non_occupied += 1;
            }
        }

        // Clear the vacated slot at run_end
        self.slots[run_end] = Slot::empty();
        self.metadata[run_end].set_shifted(false);
        self.metadata[run_end].set_continuation(false);

        // Now shift all subsequent runs in the cluster backward by one slot
        loop {
            // Check if there's a next run that needs shifting
            if run_end + 1 >= self.total_slots()
                || self.is_slot_empty(run_end + 1)
                || !self.metadata[run_end + 1].is_shifted()
            {
                // End of cluster reached — finish up and report success.
                if turn_off_occupied {
                    self.metadata[canonical].set_occupied(false);
                }
                return true;
            }

            // Find the next run
            let next_run_start = run_end + 1;
            run_end = self.find_run_end(next_run_start);

            // Check if the slot before this run is now back at its canonical position
            if self.metadata[next_run_start - 1].is_occupied()
                && num_shifted_count - num_non_occupied == 1
            {
                self.metadata[next_run_start - 1].set_shifted(false);
            } else {
                self.metadata[next_run_start - 1].set_shifted(true);
            }

            // Shift each entry in this run back by one
            for i in next_run_start..=run_end {
                self.slots[i - 1] = self.slots[i];
                if self.metadata[i].is_continuation() {
                    self.metadata[i - 1].set_continuation(true);
                }
                if !self.metadata[i].is_occupied() {
                    num_non_occupied += 1;
                }
            }
            num_shifted_count += (run_end - next_run_start) as i64;

            // Clear the vacated slot
            self.slots[run_end] = Slot::empty();
            self.metadata[run_end].set_shifted(false);
            self.metadata[run_end].set_continuation(false);
        }
    }
}
701
702// EXPANSION
impl AlephFilter {
    /// Doubles the filter size and re-hashes all entries.
    ///
    /// During expansion:
    /// 1. Collects all entries using `iterate_entries()`
    /// 2. Doubles logical slots and increments quotient bits
    /// 3. Sacrifices 1 fingerprint bit per entry (`fp_bits` decreases)
    /// 4. Re-inserts entries using the stolen pivot bit to determine the new canonical slot
    /// 5. Consolidates void markers to prevent exponential void growth
    ///
    /// This supports indefinite insertion up to memory limits.
    ///
    /// Note: `num_items` is rebuilt from the re-inserted entries, so duplicates
    /// collapsed by void consolidation reduce the reported count.
    ///
    /// # Complexity
    ///
    /// O(n) where n is current item count
    ///
    fn expand(&mut self) {
        // Collect all (canonical, fp, fp_len, is_void) using Iterator
        let entries = self.iterate_entries();
        let old_q_bits = self.quotient_bits;

        // Double the filter
        self.num_slots *= 2;
        self.quotient_bits += 1;
        self.num_expansions += 1;
        self.num_extension_slots += 2; // Match Java: num_extension_slots += 2
        self.slots = vec![Slot::empty(); self.total_slots()];
        self.metadata = vec![SlotMetadata::new(); self.total_slots()];
        self.num_items = 0;
        self.mother_hashes.clear();

        // Reinsert each entry

        // Track slots that already have a void to prevent exponential void growth
        let mut void_slots: std::collections::HashSet<usize> = std::collections::HashSet::new();

        for (bucket, fingerprint, fp_len, is_void, _old_pos) in entries {
            if is_void {
                // Void: no pivot bit is available, so the entry could map to
                // either child bucket — insert into both (deduped per bucket).
                let s0 = bucket;
                let s1 = bucket | (1 << old_q_bits);
                if s0 < self.num_slots && void_slots.insert(s0) {
                    self.qf_insert_slot(s0, Slot::void_marker());
                    self.num_items += 1;
                }
                if s1 < self.num_slots && void_slots.insert(s1) {
                    self.qf_insert_slot(s1, Slot::void_marker());
                    self.num_items += 1;
                }
            } else if fp_len > 0 {
                // Steal lowest fp bit and use it to extend the slot address
                let pivot = fingerprint & 1;
                let new_bucket = bucket | ((pivot as usize) << old_q_bits);
                let new_fp = fingerprint >> 1;
                let new_len = fp_len - 1;

                if new_bucket < self.num_slots {
                    if new_len == 0 {
                        // Last fingerprint bit was consumed — becomes a void entry.
                        self.qf_insert_slot(new_bucket, Slot::void_marker());
                    } else {
                        self.qf_insert_slot(new_bucket, Slot::new(new_fp, new_len));
                    }
                    self.num_items += 1;
                }
            }
        }
        return;
    }

    /// Extracts all entries as tuples for expansion.
    ///
    /// Returns a list of `(canonical, fingerprint, fingerprint_len, is_void, position)` tuples.
    /// Uses the iterator algorithm to track canonical slots while scanning the slot array.
    ///
    /// The canonical slot is computed by tracking:
    /// - Occupied non-shifted slots (cluster starts)
    /// - Occupied shifted slots and continuations within clusters
    /// - A FIFO queue of pending run-owner positions
    ///
    /// NOTE(review): the `occ && !cont && shift` branch pushes then immediately
    /// pops the queue — net effect is rotating the current bucket forward;
    /// verify against the reference iterator if cluster accounting changes.
    ///
    /// # Returns
    ///
    /// Vec of (canonical_slot, fp_bits, fp_length, is_void, storage_position)
    ///
    fn iterate_entries(&self) -> Vec<(usize, u64, u8, bool, usize)> {
        let mut result = Vec::new();
        let mut queue: Vec<usize> = Vec::new();
        let mut current_bucket: usize = 0;

        let mut index: usize = 0;
        while index < self.total_slots() {
            // Skip slots that are empty in both metadata and data
            if self.is_slot_empty(index) && self.slots[index].is_empty() {
                index += 1;
                continue;
            }

            let occ = self.metadata[index].is_occupied();
            let cont = self.metadata[index].is_continuation();
            let shift = self.metadata[index].is_shifted();

            if occ && !cont && !shift {
                // Cluster start: reset pending-run tracking.
                queue.clear();
                queue.push(index);
                current_bucket = index;
            } else if occ && cont && shift {
                queue.push(index);
            } else if !occ && !cont && shift {
                // Start of a new (shifted) run: advance to the next pending owner.
                if !queue.is_empty() { queue.remove(0); }
                current_bucket = queue.first().copied().unwrap_or(index);
            } else if !occ && cont && shift {
                // continuation — bucket unchanged
            } else if occ && !cont && shift {
                queue.push(index);
                if !queue.is_empty() { queue.remove(0); }
                current_bucket = queue.first().copied().unwrap_or(index);
            }

            // Record non-empty data slots (tombstones are dropped here, which is
            // how deleted voids disappear during expansion)
            if !self.slots[index].is_empty() && !self.slots[index].is_tombstone() {
                result.push((
                    current_bucket,
                    self.slots[index].fingerprint(),
                    self.slots[index].length(),
                    self.slots[index].is_void(),
                    index,
                ));
            }

            index += 1;
        }
        return result;
    }
}
836
837// DEBUG
838
839impl std::fmt::Debug for AlephFilter {
840 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
841 f.debug_struct("AlephFilter")
842 .field("num_items", &self.num_items)
843 .field("num_slots", &self.num_slots)
844 .field("num_expansions", &self.num_expansions)
845 .field("load_factor", &self.load_factor())
846 .finish()
847 }
848}
849