Skip to main content

nodedb_types/approx/
hll.rs

1//! HyperLogLog — approximate count distinct (12 KB memory, ~0.8% error).
2
3/// HyperLogLog cardinality estimator.
4///
5/// Uses 2^14 = 16384 registers (12 KB memory). Achieves ~0.8% relative
6/// error at any cardinality. Mergeable: `hll_a.merge(&hll_b)` produces
7/// the union cardinality.
8#[derive(Debug)]
9pub struct HyperLogLog {
10    registers: Vec<u8>,
11    precision: u8,
12}
13
14impl HyperLogLog {
15    pub fn new() -> Self {
16        Self::with_precision(14)
17    }
18
19    pub fn with_precision(p: u8) -> Self {
20        let p = p.clamp(4, 18);
21        let m = 1usize << p;
22        Self {
23            registers: vec![0u8; m],
24            precision: p,
25        }
26    }
27
28    pub fn add(&mut self, value: u64) {
29        let hash = splitmix64(value);
30        let m = self.registers.len();
31        let idx = (hash as usize) & (m - 1);
32        let remaining = hash >> self.precision;
33        let leading_zeros = if remaining == 0 {
34            (64 - self.precision) + 1
35        } else {
36            (remaining.leading_zeros() as u8).saturating_sub(self.precision) + 1
37        };
38        if leading_zeros > self.registers[idx] {
39            self.registers[idx] = leading_zeros;
40        }
41    }
42
43    pub fn add_batch(&mut self, values: &[u64]) {
44        for &v in values {
45            self.add(v);
46        }
47    }
48
49    pub fn add_f64_batch(&mut self, values: &[f64]) {
50        for &v in values {
51            self.add(v.to_bits());
52        }
53    }
54
55    pub fn estimate(&self) -> f64 {
56        let m = self.registers.len() as f64;
57        let alpha = match self.registers.len() {
58            16 => 0.673,
59            32 => 0.697,
60            64 => 0.709,
61            _ => 0.7213 / (1.0 + 1.079 / m),
62        };
63
64        let sum: f64 = self
65            .registers
66            .iter()
67            .map(|&r| 2.0f64.powi(-(r as i32)))
68            .sum();
69        let raw_estimate = alpha * m * m / sum;
70
71        if raw_estimate <= 2.5 * m {
72            let zeros = self.registers.iter().filter(|&&r| r == 0).count() as f64;
73            if zeros > 0.0 {
74                return m * (m / zeros).ln();
75            }
76        }
77
78        raw_estimate
79    }
80
81    pub fn merge(&mut self, other: &HyperLogLog) {
82        for (i, &r) in other.registers.iter().enumerate() {
83            if i < self.registers.len() && r > self.registers[i] {
84                self.registers[i] = r;
85            }
86        }
87    }
88
89    pub fn memory_bytes(&self) -> usize {
90        self.registers.len()
91    }
92
93    /// Access raw registers for serialization.
94    pub fn registers(&self) -> &[u8] {
95        &self.registers
96    }
97
98    /// Reconstruct from serialized registers (assumes precision 14).
99    pub fn from_registers(data: &[u8]) -> Self {
100        let precision = match data.len() {
101            16 => 4,
102            32 => 5,
103            64 => 6,
104            128 => 7,
105            256 => 8,
106            512 => 9,
107            1024 => 10,
108            2048 => 11,
109            4096 => 12,
110            8192 => 13,
111            16384 => 14,
112            32768 => 15,
113            65536 => 16,
114            131072 => 17,
115            262144 => 18,
116            _ => 14,
117        };
118        Self {
119            registers: data.to_vec(),
120            precision,
121        }
122    }
123}
124
125impl Default for HyperLogLog {
126    fn default() -> Self {
127        Self::new()
128    }
129}
130
131/// Splitmix64 hash — excellent avalanche for sequential integers.
132fn splitmix64(mut x: u64) -> u64 {
133    x = x.wrapping_add(0x9e3779b97f4a7c15);
134    x = (x ^ (x >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
135    x = (x ^ (x >> 27)).wrapping_mul(0x94d049bb133111eb);
136    x ^ (x >> 31)
137}
138
139#[cfg(test)]
140mod tests {
141    use super::*;
142
143    #[test]
144    fn hll_empty() {
145        let hll = HyperLogLog::new();
146        assert!(hll.estimate() < 1.0);
147    }
148
149    #[test]
150    fn hll_small_cardinality() {
151        let mut hll = HyperLogLog::new();
152        for i in 0..100u64 {
153            hll.add(i);
154        }
155        let est = hll.estimate();
156        assert!((90.0..110.0).contains(&est), "expected ~100, got {est:.0}");
157    }
158
159    #[test]
160    fn hll_large_cardinality() {
161        let mut hll = HyperLogLog::new();
162        for i in 0..100_000u64 {
163            hll.add(i);
164        }
165        let est = hll.estimate();
166        let error = (est - 100_000.0).abs() / 100_000.0;
167        assert!(error < 0.05, "expected <5% error, got {error:.3}");
168    }
169
170    #[test]
171    fn hll_duplicates_ignored() {
172        let mut hll = HyperLogLog::new();
173        for _ in 0..10_000 {
174            hll.add(42);
175        }
176        let est = hll.estimate();
177        assert!(est < 5.0, "expected ~1, got {est:.0}");
178    }
179
180    #[test]
181    fn hll_merge() {
182        let mut a = HyperLogLog::new();
183        let mut b = HyperLogLog::new();
184        for i in 0..5000u64 {
185            a.add(i);
186        }
187        for i in 3000..8000u64 {
188            b.add(i);
189        }
190        a.merge(&b);
191        let est = a.estimate();
192        let error = (est - 8000.0).abs() / 8000.0;
193        assert!(error < 0.05, "expected ~8000, got {est:.0}");
194    }
195
196    #[test]
197    fn hll_f64_batch() {
198        let mut hll = HyperLogLog::new();
199        let values: Vec<f64> = (0..1000).map(|i| i as f64 * 0.1).collect();
200        hll.add_f64_batch(&values);
201        let est = hll.estimate();
202        assert!(
203            (900.0..1100.0).contains(&est),
204            "expected ~1000, got {est:.0}"
205        );
206    }
207
208    #[test]
209    fn hll_memory() {
210        let hll = HyperLogLog::new();
211        assert_eq!(hll.memory_bytes(), 16384);
212    }
213}