bitcoin-cuckoo-cache 0.1.19

High-performance cache primitives.

// ---------------- [ File: bitcoin-cuckoo-cache/src/cuckoocache.rs ]
/// High-performance cache primitives.
/// 
/// Summary:
/// 
/// 1. `BitPackedAtomicFlags` provides bit-packed atomic flags used for garbage
///    collection.
/// 
/// 2. `Cache` is a cache that is performant in both memory usage and lookup
///    speed. It is lock-free for erase operations; elements are lazily erased
///    on the next insert.
/// 

crate::ix!();

/// A trait providing *eight* **independent** 32‑bit hashes for a value.
///
/// The reference C++ implementation uses a templated `operator()<k>(e)` to
/// obtain the *k*‑th hash; in Rust we expose all eight at once.
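///
/// A minimal sketch of an implementation (illustrative only: `MixHasher` and
/// its mixing constants are hypothetical; a real deployment would use a keyed
/// hash such as SipHash):
///
/// ```ignore
/// struct MixHasher { seed: u32 }
///
/// impl EightWayHasher<u32> for MixHasher {
///     fn hashes(&self, e: &u32) -> [u32; 8] {
///         core::array::from_fn(|i| {
///             let mut x = *e ^ self.seed ^ (i as u32);
///             // One round of a 32-bit finalizer (murmur3's fmix32).
///             x ^= x >> 16;
///             x = x.wrapping_mul(0x85EB_CA6B);
///             x ^= x >> 13;
///             x = x.wrapping_mul(0xC2B2_AE35);
///             x ^= x >> 16;
///             x
///         })
///     }
/// }
/// ```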
pub trait EightWayHasher<E> {
    /// Return eight high‑entropy hashes for `e`.
    ///
    /// Implementations **must** ensure that changing `e` changes every entry
    /// in the returned array with near‑uniform probability.
    fn hashes(&self, e: &E) -> [u32; 8];
}

//-------------------------------------------[.cpp/bitcoin/src/cuckoocache.h]

/// `Cache` implements a cache with properties similar to a cuckoo-set.
/// 
/// The cache is able to hold up to `u32::MAX - 1` elements (the top index is
/// reserved as the [`invalid`](Cache::invalid) sentinel).
/// 
///  Read Operations:
///      - contains() for `erase=false`
/// 
///  Read+Erase Operations:
///      - contains() for `erase=true`
/// 
///  Erase Operations:
///      - allow_erase()
/// 
///  Write Operations:
///      - setup()
///      - setup_bytes()
///      - insert()
///      - please_keep()
/// 
///  Synchronization Free Operations:
///      - invalid()
///      - compute_hashes()
/// 
/// User Must Guarantee:
/// 
/// 1. Write requires synchronized access (e.g. a lock).
/// 
/// 2. Read requires no concurrent Write, synchronized with the last insert.
/// 
/// 3. Erase requires no concurrent Write, synchronized with the last insert.
/// 
/// 4. An Erase caller must release all memory before allowing a new Writer.
/// 
/// Note on function names:
/// 
///   - The name "allow_erase" is used because the real discard happens later.
/// 
///   - The name "please_keep" is used because elements may be erased anyway on insert.
/// 
/// @tparam Element should be a movable and copyable type
/// 
/// @tparam Hash should implement [`EightWayHasher`]: a callable which takes an
/// Element and extracts eight high-entropy `u32` hashes from it. (The C++
/// original expresses this with a `hash_select` template parameter, invoked as
/// `Hash h; h<0>(e) ... h<7>(e)`.)
///
/// The Rust implementation keeps the exact algorithmic behaviour – including
/// lazy reclamation & epoch aging – while relying on safe interior mutability
/// (`AtomicU8` and `BitPackedAtomicFlags`) in place of the C++ `mutable` hack.
///
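///
/// A minimal synchronization sketch satisfying the guarantees above (assuming
/// a `std::sync::RwLock` wrapper; `my_cache` and `x` are placeholders):
///
/// ```ignore
/// use std::sync::RwLock;
///
/// let cache = RwLock::new(my_cache);
///
/// // Write operations (setup, insert, ...) take the exclusive lock.
/// cache.write().unwrap().insert(x);
///
/// // Read and Read+Erase operations take the shared lock; neither may run
/// // concurrently with a Write.
/// let hit = cache.read().unwrap().contains(&x, /*erase=*/ false);
/// ```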
#[derive(Getters, Debug)]
#[get = "pub(crate)"]
pub struct Cache<Element, Hash>
where
    Element: PartialEq + Clone,
    Hash: EightWayHasher<Element>,
{
    /// Stores the user elements.  `None` means “never initialised”.
    ///
    table: Vec<Option<Element>>,

    /// size stores the total available slots in the hash table
    ///
    /// Number of buckets (always `>= 2`).
    ///
    size: u32,

    /// In the C++ original, the bit_packed_atomic_flags array is marked
    /// `mutable` so that garbage collection can occur from const methods; here
    /// the atomics provide the equivalent interior mutability.
    ///
    /// Flags indicating whether a bucket **may** be overwritten (`true`) or
    /// **must** be kept (`false`).
    ///
    collection_flags: BitPackedAtomicFlags,

    /// epoch_flags tracks how recently an element was inserted into the
    /// cache. true denotes recent, false denotes not-recent. See insert()
    /// method for full semantics.
    ///
    /// `true` – member belongs to the current (youngest) epoch.  
    ///
    /// `false` – member belongs to the previous epoch and will be erased when
    ///           the epoch advances once more.
    ///
    epoch_flags: Vec<bool>,

    /// Cheap epoch‑aging heuristic; decremented on every `insert`.
    ///
    /// epoch_heuristic_counter is used to determine when an epoch might be aged
    /// & an expensive scan should be done. epoch_heuristic_counter is
    /// decremented on insert and reset to the new number of inserts which would
    /// cause the epoch to reach epoch_size when it reaches zero.
    ///
    epoch_heuristic_counter: u32,

    /// Maximum number of live entries that fit into a single epoch.
    ///
    /// epoch_size is set to be the number of elements supposed to be in
    /// a epoch. When the number of non-erased elements in an epoch exceeds
    /// epoch_size, a new epoch should be started and all current entries
    /// demoted. epoch_size is set to be 45% of size because we want to keep
    /// load around 90%, and we support 3 epochs at once -- one "dead" which has
    /// been erased, one "dying" which has been marked to be erased next, and
    /// one "living" which new inserts add to.
    /// 
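    /// For example (illustrative numbers): with `size == 1000`,
    /// `epoch_size == 450`; the "living" and "dying" epochs together then hold
    /// up to roughly 900 live entries, i.e. about 90% load.
    /// 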
    epoch_size: u32,

    /// Maximum chain length the Cuckoo insertion algorithm will explore.
    ///
    /// depth_limit determines how many elements insert should try to replace.
    /// 
    /// Should be set to log2(n).
    /// 
    depth_limit: u8,

    /// User‑supplied eight‑way hash provider.
    ///
    /// hash_function is a const instance of the hash function. It cannot be
    /// static or initialized at call time as it may have internal state (such
    /// as a nonce).
    /// 
    hash_function: Hash,
}

impl<E, H> Default for Cache<E, H>
where
    E: PartialEq + Clone,
    H: EightWayHasher<E> + Default,
{
    /// You must always construct a cache with some elements via a subsequent call to setup or
    /// setup_bytes; otherwise operations will panic (out-of-bounds indexing in this Rust port,
    /// rather than the segfault possible in C++).
    fn default() -> Self {
        // A completely empty cache – *must* be `setup()` before use.
        Self {
            table: Vec::new(),
            size: 0,
            collection_flags: BitPackedAtomicFlags::new(0),
            epoch_flags: Vec::new(),
            epoch_heuristic_counter: 0,
            epoch_size: 0,
            depth_limit: 0,
            hash_function: H::default(),
        }
    }
}

impl<E, H> Cache<E, H>
where
    E: PartialEq + Clone,
    H: EightWayHasher<E>,
{
    /// Map the eight raw hashes for `e` onto bucket indices in `[0, size)`.
    ///
    /// compute_hashes is convenience for not having to write out this expression everywhere we use
    /// the hash values of an Element.
    /// 
    /// We need to map the 32-bit input hash onto a hash bucket in a range [0, size) in a manner
    /// which preserves as much of the hash's uniformity as possible. Ideally this would be done by
    /// bitmasking but the size is usually not a power of two.
    /// 
    /// The naive approach would be to use a mod -- which isn't perfectly uniform but so long as
    /// the hash is much larger than size it is not that bad. Unfortunately, mod/division is fairly
    /// slow on ordinary microprocessors (e.g. 90-ish cycles on Haswell, ARM doesn't even have an
    /// instruction for it.); when the divisor is a constant the compiler will do clever tricks to
    /// turn it into a multiply+add+shift, but size is a run-time value so the compiler can't do
    /// that here.
    /// 
    /// One option would be to implement the same trick the compiler uses and compute the constants
    /// for exact division based on the size, as described in "{N}-bit Unsigned Division via
    /// {N}-bit Multiply-Add" by Arch D. Robison in 2005. But that code is somewhat complicated and
    /// the result is still slower than other options:
    /// 
    /// Instead we treat the 32-bit random number as a Q32 fixed-point number in the range [0, 1)
    /// and simply multiply it by the size. Then we just shift the result down by 32-bits to get
    /// our bucket number. The result has non-uniformity the same as a mod, but it is much faster
    /// to compute. More about this technique can be found at
    /// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ .
    /// 
    /// The resulting non-uniformity is also more equally distributed which would be advantageous
    /// for something like linear probing, though it shouldn't matter one way or the other for
    /// a cuckoo table.
    /// 
    /// The primary disadvantage of this approach is increased intermediate precision is required
    /// but for a 32-bit random number we only need the high 32 bits of a 32*32->64 multiply, which
    /// means the operation is reasonably fast even on a typical 32-bit processor.
    /// 
    /// -----------
    /// @param e
    /// 
    /// The element whose hashes will be returned
    /// 
    /// -----------
    /// @return
    /// 
    /// Deterministic hashes derived from
    /// `e` uniformly mapped onto the range
    /// [0, size)
    /// 
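    /// A one-line worked example of the multiply-shift reduction (values
    /// chosen for illustration):
    /// 
    /// ```ignore
    /// let size: u64 = 1000;      // runtime table size, not a power of two
    /// let h: u32 = 0x8000_0000;  // ~0.5 as a Q32 fixed-point number
    /// let bucket = ((h as u64 * size) >> 32) as u32;
    /// assert_eq!(bucket, 500);   // 0.5 * 1000 == 500
    /// ```
    /// 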

    #[inline]
    pub fn compute_hashes(&self, e: &E) -> [u32; 8] {
        let size = self.size as u64;
        self.hash_function
            .hashes(e)
            .map(|h| ((h as u64 * size) >> 32) as u32)
    }

    /// A bucket index that is *never* produced by [`compute_hashes`].
    ///
    /// invalid returns a special index that can never be inserted to
    /// 
    /// -----------
    /// @return
    /// 
    /// the special constant index that can never be inserted to
    /// 
    #[inline]
    pub const fn invalid() -> u32 {
        u32::MAX
    }

    /// Mark bucket `n` as **discardable** – it may be overwritten later.
    ///
    /// allow_erase marks the element at index `n` as discardable. 
    ///
    /// Threadsafe without any concurrent insert.
    /// 
    /// ----------- @param n
    /// 
    /// the index to allow erasure of
    /// 
    #[inline]
    pub fn allow_erase(&self, n: u32) {
        self.collection_flags.bit_set(n);
        trace!(bucket = n, "allow_erase()");
    }

    /// Mark bucket `n` as **protected** – it must not be overwritten.
    ///
    /// please_keep marks the element at index `n` as an entry that should be
    /// kept.
    /// 
    /// Threadsafe without any concurrent insert.
    /// 
    /// ----------- @param n
    /// 
    /// the index to prioritize keeping
    /// 
    #[inline]
    pub fn please_keep(&self, n: u32) {
        self.collection_flags.bit_unset(n);
        trace!(bucket = n, "please_keep()");
    }

    /// Potentially advance epochs and perform lazy reclamation.
    ///
    /// epoch_check handles the changing of epochs for elements stored in the
    /// cache. epoch_check should be run before every insert.
    /// 
    /// First, epoch_check decrements and checks the cheap heuristic, and then
    /// does a more expensive scan if the cheap heuristic runs out. If the
    /// expensive scan succeeds, the epochs are aged and old elements are
    /// allow_erased. The cheap heuristic is reset to retrigger after the worst
    /// case growth of the current epoch's elements would exceed the epoch_size.
    /// 
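    /// For example (illustrative numbers): with `epoch_size == 450` and an
    /// expensive scan counting `epoch_unused == 300`, the epoch is not aged
    /// and the counter resets to `max(1, max(450 / 16, 450 - 300)) == 150`:
    /// the number of inserts after which, in the worst case, the current
    /// epoch could exceed `epoch_size`.
    /// 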
    #[inline]
    fn epoch_check(&mut self) {
        if self.epoch_heuristic_counter > 0 {
            self.epoch_heuristic_counter -= 1;
            return;
        }

        // Count un‑erased entries from the newest epoch.
        let mut epoch_unused = 0_u32;
        for i in 0..self.size {
            if self.epoch_flags[i as usize] && !self.collection_flags.bit_is_set(i) {
                epoch_unused += 1;
            }
        }

        // If there are more non-deleted entries in the current epoch than the
        // epoch size, then allow_erase on all elements in the old epoch (marked
        // false) and move all elements in the current epoch to the old epoch
        // but do not call allow_erase on their indices.
        if epoch_unused >= self.epoch_size {
            // Demote current epoch & erase the previous one.
            for i in 0..self.size {
                if self.epoch_flags[i as usize] {
                    self.epoch_flags[i as usize] = false;
                } else {
                    self.allow_erase(i);
                }
            }
            self.epoch_heuristic_counter = self.epoch_size;
        } else {
            // Cheap heuristic before the next full scan.
            let min_scan = self.epoch_size / 16;
            let diff     = self.epoch_size - epoch_unused;

            // reset the epoch_heuristic_counter to next do a scan when worst
            // case behavior (no intermittent erases) would exceed epoch size,
            // with a reasonable minimum scan size.
            //
            // Ordinarily, we would have to sanity check std::min(epoch_size,
            // epoch_unused_count), but we already know that `epoch_unused_count
            // < epoch_size` in this branch
            self.epoch_heuristic_counter = cmp::max(1, cmp::max(min_scan, diff));
        }
        debug!(counter = self.epoch_heuristic_counter, "epoch_check() reset");
    }

    /// Prepare the cache to hold *at most* `new_size` elements.
    ///
    /// Returns the *actual* table size (≥ 2).
    ///
    /// setup initializes the container to store no more than new_size elements. setup should only
    /// be called once.
    /// 
    /// ----------- @param new_size
    /// 
    /// the desired number of elements to store
    /// 
    /// ----------- @return
    /// 
    /// the maximum number of elements storable
    /// 
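    /// A minimal usage sketch (assuming some hypothetical hasher type
    /// `MyHasher: EightWayHasher<u32> + Default`):
    /// 
    /// ```ignore
    /// let mut cache: Cache<u32, MyHasher> = Cache::default();
    /// let capacity = cache.setup(100_000); // actual table size, always >= 2
    /// assert!(capacity >= 2);
    /// ```
    /// 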
    pub fn setup(&mut self, new_size: u32) -> u32 {

        // depth_limit must be at least one otherwise errors can occur.
        self.depth_limit = ((cmp::max(2, new_size) as f32).log2()) as u8;

        self.size        = cmp::max(2, new_size);

        self.table            = vec![None; self.size as usize];
        self.collection_flags.setup(self.size);
        self.epoch_flags      = vec![false; self.size as usize];

        // Set to 45% as described above
        self.epoch_size             = cmp::max(1, (45 * self.size) / 100);

        // Initially set to wait for a whole epoch
        self.epoch_heuristic_counter = self.epoch_size;

        debug!(
            size         = self.size,
            depth_limit  = self.depth_limit,
            epoch_size   = self.epoch_size,
            "cache setup() completed"
        );
        self.size
    }

    /// Convenience wrapper translating *bytes* of storage into element count.
    ///
    /// setup_bytes is a convenience function which accounts for internal memory usage when
    /// deciding how many elements to store. It isn't perfect because it doesn't account for any
    /// overhead (struct size, MallocUsage, collection and epoch flags). This was done to simplify
    /// selecting a power of two size. In the expected use case, an extra two bits per entry should
    /// be negligible compared to the size of the elements.
    /// 
    /// ----------- @param bytes
    /// 
    /// the approximate number of bytes to use for this data structure
    /// 
    /// ----------- @return
    /// 
    /// the maximum number of elements storable (see setup() documentation for more detail)
    /// 
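    /// For example (illustrative numbers): with a 32-byte element type and a
    /// 2 MiB budget, `2 * 1024 * 1024 / 32 == 65_536` elements are requested:
    /// 
    /// ```ignore
    /// let n = cache.setup_bytes(2 * 1024 * 1024);
    /// assert_eq!(n, 65_536); // assuming mem::size_of::<E>() == 32
    /// ```
    /// 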
    #[inline]
    pub fn setup_bytes(&mut self, bytes: usize) -> u32 {
        let elem_sz = cmp::max(mem::size_of::<E>(), 1);
        self.setup((bytes / elem_sz) as u32)
    }

    /// Insert `e`, potentially evicting another entry according to the standard
    /// 8‑way Cuckoo algorithm with bounded search depth.
    ///
    /// insert loops at most depth_limit times trying to insert a hash at various locations in the
    /// table via a variant of the Cuckoo Algorithm with eight hash locations.
    /// 
    /// It drops the last tried element if it runs out of depth before encountering an open slot.
    /// 
    /// Thus:
    /// 
    /// ```ignore
    /// insert(x);
    /// return contains(x, false);
    /// ```
    /// 
    /// is not guaranteed to return true.
    /// 
    /// -----------
    /// @param e
    /// 
    /// the element to insert
    /// 
    /// @post one of the following: all previously inserted elements and `e`
    /// are now in the table, one previously inserted element is evicted from
    /// the table, or the entry attempted to be inserted is evicted.
    /// 
    #[inline]
    pub fn insert(&mut self, mut e: E) {
        self.epoch_check();

        let mut last_loc  = Self::invalid();
        let mut last_epoch = true;
        let mut locs      = self.compute_hashes(&e);

        // Already present?
        //
        // Make sure we have not already inserted this element
        // If we have, make sure that it does not get deleted
        for &loc in &locs {
            if let Some(ref existing) = self.table[loc as usize] {
                if existing == &e {
                    self.please_keep(loc);
                    self.epoch_flags[loc as usize] = last_epoch;
                    return;
                }
            }
        }

        for _depth in 0..self.depth_limit {
            // First try to insert to an empty slot, if one exists
            for &loc in &locs {
                if self.collection_flags.bit_is_set(loc) {
                    self.table[loc as usize] = Some(e);
                    self.please_keep(loc);
                    self.epoch_flags[loc as usize] = last_epoch;
                    return;
                }
            }

            // Evict an entry – choose “next after last_loc” in `locs`.
            let idx = locs
                .iter()
                .position(|&x| x == last_loc)
                .map(|p| (p + 1) & 7)
                .unwrap_or(0);

            /* Swap with the element at the location that was
             * not the last one looked at. Example:
             *
             * 1. On the first iteration, last_loc == invalid(), position()
             *    finds nothing, so last_loc defaults to locs[0].
             * 2. On further iterations, where last_loc == locs[k], last_loc
             *    moves on to locs[(k + 1) % 8], i.e. the next of the 8
             *    indices, wrapping around to 0 if needed.
             *
             * This prevents moving the element we just put in.
             *
             * The swap is not a move -- we must switch onto the evicted
             * element for the next iteration.
             */
            last_loc = locs[idx];

            let slot           = &mut self.table[last_loc as usize];
            let mut victim     = slot.take().expect("victim must exist");
            mem::swap(&mut victim, &mut e); // move victim into `e`
            *slot = Some(victim);

            let epoch_current = last_epoch;
            last_epoch        = self.epoch_flags[last_loc as usize];
            self.epoch_flags[last_loc as usize] = epoch_current;

            // Re‑hash the newly evicted element.
            //
            // Recompute the locs -- unfortunately happens one too many times!
            locs = self.compute_hashes(&e);
        }
        // Dropped element `e` silently falls off the cache.
        trace!("insert() evicted element after depth_limit");
    }

    /// Check whether `e` is present, optionally marking its bucket discardable.
    ///
    /// contains iterates through the hash locations for a given element and checks to see if it
    /// is present. contains does not check garbage collected state (in other words, garbage is
    /// only collected when the space is needed), so:
    /// 
    /// ```ignore
    /// insert(x);
    /// if (contains(x, true))
    ///     return contains(x, false);
    /// else
    ///     return true;
    /// ```
    /// 
    /// executed on a single thread will always return true!
    /// 
    /// This is a great property for re-org performance, for example.
    /// 
    /// -----------
    /// @param e
    /// 
    /// the element to check
    /// 
    /// -----------
    /// @param erase
    /// 
    /// whether to attempt setting the garbage collect flag
    /// 
    /// @post if erase is true and the element is found, then the garbage collect flag is set
    /// 
    /// -----------
    /// @return
    /// 
    /// true if the element is found, false otherwise
    /// 
    #[inline]
    pub fn contains(&self, e: &E, erase: bool) -> bool {
        for &loc in &self.compute_hashes(e) {
            if let Some(ref existing) = self.table[loc as usize] {
                if existing == e {
                    if erase {
                        self.allow_erase(loc);
                    }
                    return true;
                }
            }
        }
        false
    }
}

#[cfg(test)]
mod cuckoo_cache_suite {
    use super::*;

    /// A trivial eight‑way hasher for `u32` elements.
    #[derive(Default)]
    struct TrivialHasher;

    impl EightWayHasher<u32> for TrivialHasher {
        fn hashes(&self, e: &u32) -> [u32; 8] {
            let mut h = [0u32; 8];
            for i in 0..8 {
                h[i] = e.wrapping_add(i as u32).wrapping_mul(0x9E37_79B9);
            }
            h
        }
    }

    fn fresh_cache(cap: u32) -> Cache<u32, TrivialHasher> {
        let mut c: Cache<u32, TrivialHasher> = Cache::default();
        c.setup(cap);
        c
    }

    #[traced_test]
    fn setup_establishes_parameters() {
        let mut c = Cache::<u32, TrivialHasher>::default();
        let sz = c.setup(100);
        assert_eq!(sz, *c.size());
        assert!(*c.depth_limit() > 0);
        assert_eq!(c.collection_flags().mem().len(), ((sz + 7) / 8) as usize);
    }

    #[traced_test]
    fn insert_and_contains_roundtrip() {
        let mut c = fresh_cache(128);
        for i in 0..50u32 {
            c.insert(i);
        }
        for i in 0..50u32 {
            assert!(c.contains(&i, false), "cache should contain {i}");
        }
    }

    /// Erasure marks a bucket *discardable* but does **not** remove the element
    /// until a later insertion needs the slot.  This test follows the contract
    /// spelled out in the original C++ comments.
    #[traced_test]
    fn erase_mechanism_works() {
        let mut c = fresh_cache(32);
        c.insert(42);

        // First lookup sets the GC flag.
        assert!(c.contains(&42, true), "first lookup marks for erase");

        // The element is still discoverable until the slot is reused.
        assert!(c.contains(&42, false), "element must remain until reclaimed");

        // Push enough additional elements to *possibly* reclaim the slot.
        for i in 0..64u32 {
            if i != 42 {
                c.insert(i);
            }
        }
        // At this point `42` may or may not still be present; the contract places
        // no guarantee once further inserts occur, so we make no assertion.
    }
}