Skip to main content

nodedb_types/approx/
spacesaving.rs

1//! SpaceSaving — approximate top-K heavy hitters (bounded memory).
2
3use std::collections::HashMap;
4
5/// Space-saving algorithm for approximate top-K heavy hitters.
6///
7/// Tracks the K most frequent items with bounded memory. Items not in
8/// the top K are approximated — their counts may be over-estimated by
9/// at most the minimum count in the structure.
10#[derive(Debug)]
11pub struct SpaceSaving {
12    items: HashMap<u64, (u64, u64)>,
13    max_items: usize,
14}
15
16impl SpaceSaving {
17    pub fn new(k: usize) -> Self {
18        Self {
19            items: HashMap::with_capacity(k + 1),
20            max_items: k.max(1),
21        }
22    }
23
24    pub fn add(&mut self, item: u64) {
25        if let Some(entry) = self.items.get_mut(&item) {
26            entry.0 += 1;
27            return;
28        }
29
30        if self.items.len() < self.max_items {
31            self.items.insert(item, (1, 0));
32        } else {
33            let Some((&min_key, &(min_count, _))) =
34                self.items.iter().min_by_key(|(_, (count, _))| *count)
35            else {
36                return;
37            };
38            self.items.remove(&min_key);
39            self.items.insert(item, (min_count + 1, min_count));
40        }
41    }
42
43    pub fn add_batch(&mut self, items: &[u64]) {
44        for &item in items {
45            self.add(item);
46        }
47    }
48
49    /// Get the top-K items sorted by count (descending).
50    ///
51    /// Returns `(item, count, error_bound)` tuples.
52    pub fn top_k(&self) -> Vec<(u64, u64, u64)> {
53        let mut result: Vec<(u64, u64, u64)> = self
54            .items
55            .iter()
56            .map(|(&item, &(count, error))| (item, count, error))
57            .collect();
58        result.sort_by_key(|item| std::cmp::Reverse(item.1));
59        result
60    }
61
62    pub fn merge(&mut self, other: &SpaceSaving) {
63        for (&item, &(count, error)) in &other.items {
64            let entry = self.items.entry(item).or_insert((0, 0));
65            entry.0 += count;
66            entry.1 += error;
67        }
68
69        while self.items.len() > self.max_items {
70            let Some((&min_key, _)) = self.items.iter().min_by_key(|(_, (count, _))| *count) else {
71                break;
72            };
73            self.items.remove(&min_key);
74        }
75    }
76}
77
78#[cfg(test)]
79mod tests {
80    use super::*;
81
82    #[test]
83    fn topk_basic() {
84        let mut ss = SpaceSaving::new(3);
85        for _ in 0..100 {
86            ss.add(1);
87        }
88        for _ in 0..50 {
89            ss.add(2);
90        }
91        for _ in 0..30 {
92            ss.add(3);
93        }
94        for _ in 0..10 {
95            ss.add(4);
96        }
97        let top = ss.top_k();
98        assert_eq!(top[0].0, 1);
99        assert_eq!(top[0].1, 100);
100    }
101
102    #[test]
103    fn topk_merge() {
104        let mut a = SpaceSaving::new(5);
105        let mut b = SpaceSaving::new(5);
106        for _ in 0..100 {
107            a.add(1);
108        }
109        for _ in 0..80 {
110            b.add(1);
111        }
112        for _ in 0..50 {
113            b.add(2);
114        }
115        a.merge(&b);
116        let top = a.top_k();
117        assert_eq!(top[0].0, 1);
118        assert_eq!(top[0].1, 180);
119    }
120
121    #[test]
122    fn topk_eviction() {
123        let mut ss = SpaceSaving::new(3);
124        for i in 0..10u64 {
125            for _ in 0..(10 - i) {
126                ss.add(i);
127            }
128        }
129        let top = ss.top_k();
130        assert_eq!(top.len(), 3);
131        assert!(top[0].1 >= top[1].1);
132        assert!(top[1].1 >= top[2].1);
133    }
134}