mini_moka/common/frequency_sketch.rs
1// License and Copyright Notice:
2//
3// Some of the code and doc comments in this module were ported or copied from
4// a Java class `com.github.benmanes.caffeine.cache.FrequencySketch` of Caffeine.
5// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/main/java/com/github/benmanes/caffeine/cache/FrequencySketch.java
6//
7// The original code/comments from Caffeine are licensed under the Apache License,
8// Version 2.0 <https://github.com/ben-manes/caffeine/blob/master/LICENSE>
9//
10// Copyrights of the original code/comments are retained by their contributors.
11// For full authorship information, see the version control history of
12// https://github.com/ben-manes/caffeine/
13
14/// A probabilistic multi-set for estimating the popularity of an element within
15/// a time window. The maximum frequency of an element is limited to 15 (4-bits)
16/// and an aging process periodically halves the popularity of all elements.
17#[derive(Default)]
18pub(crate) struct FrequencySketch {
19 sample_size: u32,
20 table_mask: u64,
21 table: Box<[u64]>,
22 size: u32,
23}
24
25// A mixture of seeds from FNV-1a, CityHash, and Murmur3. (Taken from Caffeine)
26static SEED: [u64; 4] = [
27 0xc3a5_c85c_97cb_3127,
28 0xb492_b66f_be98_f273,
29 0x9ae1_6a3b_2f90_404f,
30 0xcbf2_9ce4_8422_2325,
31];
32
33static RESET_MASK: u64 = 0x7777_7777_7777_7777;
34
35static ONE_MASK: u64 = 0x1111_1111_1111_1111;
36
37// -------------------------------------------------------------------------------
38// Some of the code and doc comments in this module were ported or copied from
39// a Java class `com.github.benmanes.caffeine.cache.FrequencySketch` of Caffeine.
40// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/main/java/com/github/benmanes/caffeine/cache/FrequencySketch.java
41// -------------------------------------------------------------------------------
42//
43// FrequencySketch maintains a 4-bit CountMinSketch [1] with periodic aging to
44// provide the popularity history for the TinyLfu admission policy [2].
45// The time and space efficiency of the sketch allows it to cheaply estimate the
46// frequency of an entry in a stream of cache access events.
47//
48// The counter matrix is represented as a single dimensional array holding 16
49// counters per slot. A fixed depth of four balances the accuracy and cost,
50// resulting in a width of four times the length of the array. To retain an
51// accurate estimation the array's length equals the maximum number of entries
52// in the cache, increased to the closest power-of-two to exploit more efficient
53// bit masking. This configuration results in a confidence of 93.75% and error
54// bound of e / width.
55//
56// The frequency of all entries is aged periodically using a sampling window
57// based on the maximum number of entries in the cache. This is referred to as
58// the reset operation by TinyLfu and keeps the sketch fresh by dividing all
59// counters by two and subtracting based on the number of odd counters
60// found. The O(n) cost of aging is amortized, ideal for hardware pre-fetching,
61// and uses inexpensive bit manipulations per array location.
62//
63// [1] An Improved Data Stream Summary: The Count-Min Sketch and its Applications
64// http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
65// [2] TinyLFU: A Highly Efficient Cache Admission Policy
66// https://dl.acm.org/citation.cfm?id=3149371
67//
68// -------------------------------------------------------------------------------
69
70impl FrequencySketch {
71 /// Initializes and increases the capacity of this `FrequencySketch` instance,
72 /// if necessary, to ensure that it can accurately estimate the popularity of
73 /// elements given the maximum size of the cache. This operation forgets all
74 /// previous counts when resizing.
75 pub(crate) fn ensure_capacity(&mut self, cap: u32) {
76 // The max byte size of the table, Box<[u64; table_size]>
77 //
78 // | Pointer width | Max size |
79 // |:-----------------|---------:|
80 // | 16 bit | 8 KiB |
81 // | 32 bit | 128 MiB |
82 // | 64 bit or bigger | 8 GiB |
83
84 let maximum = if cfg!(target_pointer_width = "16") {
85 cap.min(1024)
86 } else if cfg!(target_pointer_width = "32") {
87 cap.min(2u32.pow(24)) // about 16 millions
88 } else {
89 // Same to Caffeine's limit:
90 // `Integer.MAX_VALUE >>> 1` with `ceilingPowerOfTwo()` applied.
91 cap.min(2u32.pow(30)) // about 1 billion
92 };
93 let table_size = if maximum == 0 {
94 1
95 } else {
96 maximum.next_power_of_two()
97 };
98
99 if self.table.len() as u32 >= table_size {
100 return;
101 }
102
103 self.table = vec![0; table_size as usize].into_boxed_slice();
104 self.table_mask = 0.max(table_size - 1) as u64;
105 self.sample_size = if cap == 0 {
106 10
107 } else {
108 maximum.saturating_mul(10).min(i32::MAX as u32)
109 };
110 }
111
112 /// Takes the hash value of an element, and returns the estimated number of
113 /// occurrences of the element, up to the maximum (15).
114 pub(crate) fn frequency(&self, hash: u64) -> u8 {
115 if self.table.is_empty() {
116 return 0;
117 }
118
119 let start = ((hash & 3) << 2) as u8;
120 let mut frequency = std::u8::MAX;
121 for i in 0..4 {
122 let index = self.index_of(hash, i);
123 let count = (self.table[index] >> ((start + i) << 2) & 0xF) as u8;
124 frequency = frequency.min(count);
125 }
126 frequency
127 }
128
129 /// Take a hash value of an element and increments the popularity of the
130 /// element if it does not exceed the maximum (15). The popularity of all
131 /// elements will be periodically down sampled when the observed events
132 /// exceeds a threshold. This process provides a frequency aging to allow
133 /// expired long term entries to fade away.
134 pub(crate) fn increment(&mut self, hash: u64) {
135 if self.table.is_empty() {
136 return;
137 }
138
139 let start = ((hash & 3) << 2) as u8;
140 let mut added = false;
141 for i in 0..4 {
142 let index = self.index_of(hash, i);
143 added |= self.increment_at(index, start + i);
144 }
145
146 if added {
147 self.size += 1;
148 if self.size >= self.sample_size {
149 self.reset();
150 }
151 }
152 }
153
154 /// Takes a table index (each entry has 16 counters) and counter index, and
155 /// increments the counter by 1 if it is not already at the maximum value
156 /// (15). Returns `true` if incremented.
157 fn increment_at(&mut self, table_index: usize, counter_index: u8) -> bool {
158 let offset = (counter_index as usize) << 2;
159 let mask = 0xF_u64 << offset;
160 if self.table[table_index] & mask != mask {
161 self.table[table_index] += 1u64 << offset;
162 true
163 } else {
164 false
165 }
166 }
167
168 /// Reduces every counter by half of its original value.
169 fn reset(&mut self) {
170 let mut count = 0u32;
171 for entry in self.table.iter_mut() {
172 // Count number of odd numbers.
173 count += (*entry & ONE_MASK).count_ones();
174 *entry = (*entry >> 1) & RESET_MASK;
175 }
176 self.size = (self.size >> 1) - (count >> 2);
177 }
178
179 /// Returns the table index for the counter at the specified depth.
180 fn index_of(&self, hash: u64, depth: u8) -> usize {
181 let i = depth as usize;
182 let mut hash = hash.wrapping_add(SEED[i]).wrapping_mul(SEED[i]);
183 hash = hash.wrapping_add(hash >> 32);
184 (hash & self.table_mask) as usize
185 }
186}
187
188// Methods only available for testing.
189#[cfg(test)]
190impl FrequencySketch {
191 pub(crate) fn table_len(&self) -> usize {
192 self.table.len()
193 }
194}
195
196// Some test cases were ported from Caffeine at:
197// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/test/java/com/github/benmanes/caffeine/cache/FrequencySketchTest.java
198//
199// To see the debug prints, run test as `cargo test -- --nocapture`
200#[cfg(test)]
201mod tests {
202 use super::FrequencySketch;
203 use once_cell::sync::Lazy;
204 use std::hash::{BuildHasher, Hash, Hasher};
205
206 static ITEM: Lazy<u32> = Lazy::new(|| {
207 let mut buf = [0; 4];
208 getrandom::getrandom(&mut buf).unwrap();
209 unsafe { std::mem::transmute::<[u8; 4], u32>(buf) }
210 });
211
212 // This test was ported from Caffeine.
213 #[test]
214 fn increment_once() {
215 let mut sketch = FrequencySketch::default();
216 sketch.ensure_capacity(512);
217 let hasher = hasher();
218 let item_hash = hasher(*ITEM);
219 sketch.increment(item_hash);
220 assert_eq!(sketch.frequency(item_hash), 1);
221 }
222
223 // This test was ported from Caffeine.
224 #[test]
225 fn increment_max() {
226 let mut sketch = FrequencySketch::default();
227 sketch.ensure_capacity(512);
228 let hasher = hasher();
229 let item_hash = hasher(*ITEM);
230 for _ in 0..20 {
231 sketch.increment(item_hash);
232 }
233 assert_eq!(sketch.frequency(item_hash), 15);
234 }
235
236 // This test was ported from Caffeine.
237 #[test]
238 fn increment_distinct() {
239 let mut sketch = FrequencySketch::default();
240 sketch.ensure_capacity(512);
241 let hasher = hasher();
242 sketch.increment(hasher(*ITEM));
243 sketch.increment(hasher(ITEM.wrapping_add(1)));
244 assert_eq!(sketch.frequency(hasher(*ITEM)), 1);
245 assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(1))), 1);
246 assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(2))), 0);
247 }
248
249 // This test was ported from Caffeine.
250 #[test]
251 fn index_of_around_zero() {
252 let mut sketch = FrequencySketch::default();
253 sketch.ensure_capacity(512);
254 let mut indexes = std::collections::HashSet::new();
255 let hashes = [std::u64::MAX, 0, 1];
256 for hash in hashes.iter() {
257 for depth in 0..4 {
258 indexes.insert(sketch.index_of(*hash, depth));
259 }
260 }
261 assert_eq!(indexes.len(), 4 * hashes.len())
262 }
263
264 // This test was ported from Caffeine.
265 #[test]
266 fn reset() {
267 let mut reset = false;
268 let mut sketch = FrequencySketch::default();
269 sketch.ensure_capacity(64);
270 let hasher = hasher();
271
272 for i in 1..(20 * sketch.table.len() as u32) {
273 sketch.increment(hasher(i));
274 if sketch.size != i {
275 reset = true;
276 break;
277 }
278 }
279
280 assert!(reset);
281 assert!(sketch.size <= sketch.sample_size / 2);
282 }
283
284 // This test was ported from Caffeine.
285 #[test]
286 fn heavy_hitters() {
287 let mut sketch = FrequencySketch::default();
288 sketch.ensure_capacity(65_536);
289 let hasher = hasher();
290
291 for i in 100..100_000 {
292 sketch.increment(hasher(i));
293 }
294
295 for i in (0..10).step_by(2) {
296 for _ in 0..i {
297 sketch.increment(hasher(i));
298 }
299 }
300
301 // A perfect popularity count yields an array [0, 0, 2, 0, 4, 0, 6, 0, 8, 0]
302 let popularity = (0..10)
303 .map(|i| sketch.frequency(hasher(i)))
304 .collect::<Vec<_>>();
305
306 for (i, freq) in popularity.iter().enumerate() {
307 match i {
308 2 => assert!(freq <= &popularity[4]),
309 4 => assert!(freq <= &popularity[6]),
310 6 => assert!(freq <= &popularity[8]),
311 8 => (),
312 _ => assert!(freq <= &popularity[2]),
313 }
314 }
315 }
316
317 fn hasher<K: Hash>() -> impl Fn(K) -> u64 {
318 let build_hasher = std::collections::hash_map::RandomState::default();
319 move |key| {
320 let mut hasher = build_hasher.build_hasher();
321 key.hash(&mut hasher);
322 hasher.finish()
323 }
324 }
325}
326
327// Verify that some properties hold such as no panic occurs on any possible inputs.
328#[cfg(kani)]
329mod kani {
330 use super::FrequencySketch;
331
332 const CAPACITIES: &[u32] = &[
333 0,
334 1,
335 1024,
336 1025,
337 2u32.pow(24),
338 2u32.pow(24) + 1,
339 2u32.pow(30),
340 2u32.pow(30) + 1,
341 u32::MAX,
342 ];
343
344 #[kani::proof]
345 fn verify_ensure_capacity() {
346 // Check for arbitrary capacities.
347 let capacity = kani::any();
348 let mut sketch = FrequencySketch::default();
349 sketch.ensure_capacity(capacity);
350 }
351
352 #[kani::proof]
353 fn verify_frequency() {
354 // Check for some selected capacities.
355 for capacity in CAPACITIES {
356 let mut sketch = FrequencySketch::default();
357 sketch.ensure_capacity(*capacity);
358
359 // Check for arbitrary hashes.
360 let hash = kani::any();
361 let frequency = sketch.frequency(hash);
362 assert!(frequency <= 15);
363 }
364 }
365
366 #[kani::proof]
367 fn verify_increment() {
368 // Only check for small capacities. Because Kani Rust Verifier is a model
369 // checking tool, it will take much longer time (exponential) to check larger
370 // capacities here.
371 for capacity in &[0, 1, 128] {
372 let mut sketch = FrequencySketch::default();
373 sketch.ensure_capacity(*capacity);
374
375 // Check for arbitrary hashes.
376 let hash = kani::any();
377 sketch.increment(hash);
378 }
379 }
380
381 #[kani::proof]
382 fn verify_index_of() {
383 // Check for arbitrary capacities.
384 let capacity = kani::any();
385 let mut sketch = FrequencySketch::default();
386 sketch.ensure_capacity(capacity);
387
388 // Check for arbitrary hashes.
389 let hash = kani::any();
390 for i in 0..4 {
391 let index = sketch.index_of(hash, i);
392 assert!(index < sketch.table.len());
393 }
394 }
395}