mini_moka/common/
frequency_sketch.rs

1// License and Copyright Notice:
2//
3// Some of the code and doc comments in this module were ported or copied from
4// a Java class `com.github.benmanes.caffeine.cache.FrequencySketch` of Caffeine.
5// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/main/java/com/github/benmanes/caffeine/cache/FrequencySketch.java
6//
7// The original code/comments from Caffeine are licensed under the Apache License,
8// Version 2.0 <https://github.com/ben-manes/caffeine/blob/master/LICENSE>
9//
10// Copyrights of the original code/comments are retained by their contributors.
11// For full authorship information, see the version control history of
12// https://github.com/ben-manes/caffeine/
13
14/// A probabilistic multi-set for estimating the popularity of an element within
15/// a time window. The maximum frequency of an element is limited to 15 (4-bits)
16/// and an aging process periodically halves the popularity of all elements.
17#[derive(Default)]
18pub(crate) struct FrequencySketch {
19    sample_size: u32,
20    table_mask: u64,
21    table: Box<[u64]>,
22    size: u32,
23}
24
25// A mixture of seeds from FNV-1a, CityHash, and Murmur3. (Taken from Caffeine)
26static SEED: [u64; 4] = [
27    0xc3a5_c85c_97cb_3127,
28    0xb492_b66f_be98_f273,
29    0x9ae1_6a3b_2f90_404f,
30    0xcbf2_9ce4_8422_2325,
31];
32
33static RESET_MASK: u64 = 0x7777_7777_7777_7777;
34
35static ONE_MASK: u64 = 0x1111_1111_1111_1111;
36
37// -------------------------------------------------------------------------------
38// Some of the code and doc comments in this module were ported or copied from
39// a Java class `com.github.benmanes.caffeine.cache.FrequencySketch` of Caffeine.
40// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/main/java/com/github/benmanes/caffeine/cache/FrequencySketch.java
41// -------------------------------------------------------------------------------
42//
43// FrequencySketch maintains a 4-bit CountMinSketch [1] with periodic aging to
44// provide the popularity history for the TinyLfu admission policy [2].
45// The time and space efficiency of the sketch allows it to cheaply estimate the
46// frequency of an entry in a stream of cache access events.
47//
48// The counter matrix is represented as a single dimensional array holding 16
49// counters per slot. A fixed depth of four balances the accuracy and cost,
50// resulting in a width of four times the length of the array. To retain an
51// accurate estimation the array's length equals the maximum number of entries
52// in the cache, increased to the closest power-of-two to exploit more efficient
53// bit masking. This configuration results in a confidence of 93.75% and error
54// bound of e / width.
55//
56// The frequency of all entries is aged periodically using a sampling window
57// based on the maximum number of entries in the cache. This is referred to as
58// the reset operation by TinyLfu and keeps the sketch fresh by dividing all
59// counters by two and subtracting based on the number of odd counters
60// found. The O(n) cost of aging is amortized, ideal for hardware pre-fetching,
61// and uses inexpensive bit manipulations per array location.
62//
63// [1] An Improved Data Stream Summary: The Count-Min Sketch and its Applications
64//     http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
65// [2] TinyLFU: A Highly Efficient Cache Admission Policy
66//     https://dl.acm.org/citation.cfm?id=3149371
67//
68// -------------------------------------------------------------------------------
69
70impl FrequencySketch {
71    /// Initializes and increases the capacity of this `FrequencySketch` instance,
72    /// if necessary, to ensure that it can accurately estimate the popularity of
73    /// elements given the maximum size of the cache. This operation forgets all
74    /// previous counts when resizing.
75    pub(crate) fn ensure_capacity(&mut self, cap: u32) {
76        // The max byte size of the table, Box<[u64; table_size]>
77        //
78        // | Pointer width    | Max size |
79        // |:-----------------|---------:|
80        // | 16 bit           |    8 KiB |
81        // | 32 bit           |  128 MiB |
82        // | 64 bit or bigger |    8 GiB |
83
84        let maximum = if cfg!(target_pointer_width = "16") {
85            cap.min(1024)
86        } else if cfg!(target_pointer_width = "32") {
87            cap.min(2u32.pow(24)) // about 16 millions
88        } else {
89            // Same to Caffeine's limit:
90            //   `Integer.MAX_VALUE >>> 1` with `ceilingPowerOfTwo()` applied.
91            cap.min(2u32.pow(30)) // about 1 billion
92        };
93        let table_size = if maximum == 0 {
94            1
95        } else {
96            maximum.next_power_of_two()
97        };
98
99        if self.table.len() as u32 >= table_size {
100            return;
101        }
102
103        self.table = vec![0; table_size as usize].into_boxed_slice();
104        self.table_mask = 0.max(table_size - 1) as u64;
105        self.sample_size = if cap == 0 {
106            10
107        } else {
108            maximum.saturating_mul(10).min(i32::MAX as u32)
109        };
110    }
111
112    /// Takes the hash value of an element, and returns the estimated number of
113    /// occurrences of the element, up to the maximum (15).
114    pub(crate) fn frequency(&self, hash: u64) -> u8 {
115        if self.table.is_empty() {
116            return 0;
117        }
118
119        let start = ((hash & 3) << 2) as u8;
120        let mut frequency = std::u8::MAX;
121        for i in 0..4 {
122            let index = self.index_of(hash, i);
123            let count = (self.table[index] >> ((start + i) << 2) & 0xF) as u8;
124            frequency = frequency.min(count);
125        }
126        frequency
127    }
128
129    /// Take a hash value of an element and increments the popularity of the
130    /// element if it does not exceed the maximum (15). The popularity of all
131    /// elements will be periodically down sampled when the observed events
132    /// exceeds a threshold. This process provides a frequency aging to allow
133    /// expired long term entries to fade away.
134    pub(crate) fn increment(&mut self, hash: u64) {
135        if self.table.is_empty() {
136            return;
137        }
138
139        let start = ((hash & 3) << 2) as u8;
140        let mut added = false;
141        for i in 0..4 {
142            let index = self.index_of(hash, i);
143            added |= self.increment_at(index, start + i);
144        }
145
146        if added {
147            self.size += 1;
148            if self.size >= self.sample_size {
149                self.reset();
150            }
151        }
152    }
153
154    /// Takes a table index (each entry has 16 counters) and counter index, and
155    /// increments the counter by 1 if it is not already at the maximum value
156    /// (15). Returns `true` if incremented.
157    fn increment_at(&mut self, table_index: usize, counter_index: u8) -> bool {
158        let offset = (counter_index as usize) << 2;
159        let mask = 0xF_u64 << offset;
160        if self.table[table_index] & mask != mask {
161            self.table[table_index] += 1u64 << offset;
162            true
163        } else {
164            false
165        }
166    }
167
168    /// Reduces every counter by half of its original value.
169    fn reset(&mut self) {
170        let mut count = 0u32;
171        for entry in self.table.iter_mut() {
172            // Count number of odd numbers.
173            count += (*entry & ONE_MASK).count_ones();
174            *entry = (*entry >> 1) & RESET_MASK;
175        }
176        self.size = (self.size >> 1) - (count >> 2);
177    }
178
179    /// Returns the table index for the counter at the specified depth.
180    fn index_of(&self, hash: u64, depth: u8) -> usize {
181        let i = depth as usize;
182        let mut hash = hash.wrapping_add(SEED[i]).wrapping_mul(SEED[i]);
183        hash = hash.wrapping_add(hash >> 32);
184        (hash & self.table_mask) as usize
185    }
186}
187
188// Methods only available for testing.
189#[cfg(test)]
190impl FrequencySketch {
191    pub(crate) fn table_len(&self) -> usize {
192        self.table.len()
193    }
194}
195
196// Some test cases were ported from Caffeine at:
197// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/test/java/com/github/benmanes/caffeine/cache/FrequencySketchTest.java
198//
199// To see the debug prints, run test as `cargo test -- --nocapture`
200#[cfg(test)]
201mod tests {
202    use super::FrequencySketch;
203    use once_cell::sync::Lazy;
204    use std::hash::{BuildHasher, Hash, Hasher};
205
206    static ITEM: Lazy<u32> = Lazy::new(|| {
207        let mut buf = [0; 4];
208        getrandom::getrandom(&mut buf).unwrap();
209        unsafe { std::mem::transmute::<[u8; 4], u32>(buf) }
210    });
211
212    // This test was ported from Caffeine.
213    #[test]
214    fn increment_once() {
215        let mut sketch = FrequencySketch::default();
216        sketch.ensure_capacity(512);
217        let hasher = hasher();
218        let item_hash = hasher(*ITEM);
219        sketch.increment(item_hash);
220        assert_eq!(sketch.frequency(item_hash), 1);
221    }
222
223    // This test was ported from Caffeine.
224    #[test]
225    fn increment_max() {
226        let mut sketch = FrequencySketch::default();
227        sketch.ensure_capacity(512);
228        let hasher = hasher();
229        let item_hash = hasher(*ITEM);
230        for _ in 0..20 {
231            sketch.increment(item_hash);
232        }
233        assert_eq!(sketch.frequency(item_hash), 15);
234    }
235
236    // This test was ported from Caffeine.
237    #[test]
238    fn increment_distinct() {
239        let mut sketch = FrequencySketch::default();
240        sketch.ensure_capacity(512);
241        let hasher = hasher();
242        sketch.increment(hasher(*ITEM));
243        sketch.increment(hasher(ITEM.wrapping_add(1)));
244        assert_eq!(sketch.frequency(hasher(*ITEM)), 1);
245        assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(1))), 1);
246        assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(2))), 0);
247    }
248
249    // This test was ported from Caffeine.
250    #[test]
251    fn index_of_around_zero() {
252        let mut sketch = FrequencySketch::default();
253        sketch.ensure_capacity(512);
254        let mut indexes = std::collections::HashSet::new();
255        let hashes = [std::u64::MAX, 0, 1];
256        for hash in hashes.iter() {
257            for depth in 0..4 {
258                indexes.insert(sketch.index_of(*hash, depth));
259            }
260        }
261        assert_eq!(indexes.len(), 4 * hashes.len())
262    }
263
264    // This test was ported from Caffeine.
265    #[test]
266    fn reset() {
267        let mut reset = false;
268        let mut sketch = FrequencySketch::default();
269        sketch.ensure_capacity(64);
270        let hasher = hasher();
271
272        for i in 1..(20 * sketch.table.len() as u32) {
273            sketch.increment(hasher(i));
274            if sketch.size != i {
275                reset = true;
276                break;
277            }
278        }
279
280        assert!(reset);
281        assert!(sketch.size <= sketch.sample_size / 2);
282    }
283
284    // This test was ported from Caffeine.
285    #[test]
286    fn heavy_hitters() {
287        let mut sketch = FrequencySketch::default();
288        sketch.ensure_capacity(65_536);
289        let hasher = hasher();
290
291        for i in 100..100_000 {
292            sketch.increment(hasher(i));
293        }
294
295        for i in (0..10).step_by(2) {
296            for _ in 0..i {
297                sketch.increment(hasher(i));
298            }
299        }
300
301        // A perfect popularity count yields an array [0, 0, 2, 0, 4, 0, 6, 0, 8, 0]
302        let popularity = (0..10)
303            .map(|i| sketch.frequency(hasher(i)))
304            .collect::<Vec<_>>();
305
306        for (i, freq) in popularity.iter().enumerate() {
307            match i {
308                2 => assert!(freq <= &popularity[4]),
309                4 => assert!(freq <= &popularity[6]),
310                6 => assert!(freq <= &popularity[8]),
311                8 => (),
312                _ => assert!(freq <= &popularity[2]),
313            }
314        }
315    }
316
317    fn hasher<K: Hash>() -> impl Fn(K) -> u64 {
318        let build_hasher = std::collections::hash_map::RandomState::default();
319        move |key| {
320            let mut hasher = build_hasher.build_hasher();
321            key.hash(&mut hasher);
322            hasher.finish()
323        }
324    }
325}
326
327// Verify that some properties hold such as no panic occurs on any possible inputs.
328#[cfg(kani)]
329mod kani {
330    use super::FrequencySketch;
331
332    const CAPACITIES: &[u32] = &[
333        0,
334        1,
335        1024,
336        1025,
337        2u32.pow(24),
338        2u32.pow(24) + 1,
339        2u32.pow(30),
340        2u32.pow(30) + 1,
341        u32::MAX,
342    ];
343
344    #[kani::proof]
345    fn verify_ensure_capacity() {
346        // Check for arbitrary capacities.
347        let capacity = kani::any();
348        let mut sketch = FrequencySketch::default();
349        sketch.ensure_capacity(capacity);
350    }
351
352    #[kani::proof]
353    fn verify_frequency() {
354        // Check for some selected capacities.
355        for capacity in CAPACITIES {
356            let mut sketch = FrequencySketch::default();
357            sketch.ensure_capacity(*capacity);
358
359            // Check for arbitrary hashes.
360            let hash = kani::any();
361            let frequency = sketch.frequency(hash);
362            assert!(frequency <= 15);
363        }
364    }
365
366    #[kani::proof]
367    fn verify_increment() {
368        // Only check for small capacities. Because Kani Rust Verifier is a model
369        // checking tool, it will take much longer time (exponential) to check larger
370        // capacities here.
371        for capacity in &[0, 1, 128] {
372            let mut sketch = FrequencySketch::default();
373            sketch.ensure_capacity(*capacity);
374
375            // Check for arbitrary hashes.
376            let hash = kani::any();
377            sketch.increment(hash);
378        }
379    }
380
381    #[kani::proof]
382    fn verify_index_of() {
383        // Check for arbitrary capacities.
384        let capacity = kani::any();
385        let mut sketch = FrequencySketch::default();
386        sketch.ensure_capacity(capacity);
387
388        // Check for arbitrary hashes.
389        let hash = kani::any();
390        for i in 0..4 {
391            let index = sketch.index_of(hash, i);
392            assert!(index < sketch.table.len());
393        }
394    }
395}