Skip to main content

nodedb_types/approx/
hll.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! HyperLogLog — approximate count distinct (12 KB memory, ~0.8% error).
4
5/// HyperLogLog cardinality estimator.
6///
7/// Uses 2^14 = 16384 registers (12 KB memory). Achieves ~0.8% relative
8/// error at any cardinality. Mergeable: `hll_a.merge(&hll_b)` produces
9/// the union cardinality.
10#[derive(Debug, serde::Serialize, serde::Deserialize)]
11pub struct HyperLogLog {
12    registers: Vec<u8>,
13    precision: u8,
14}
15
16impl HyperLogLog {
17    pub fn new() -> Self {
18        Self::with_precision(14)
19    }
20
21    pub fn with_precision(p: u8) -> Self {
22        let p = p.clamp(4, 18);
23        let m = 1usize << p;
24        Self {
25            registers: vec![0u8; m],
26            precision: p,
27        }
28    }
29
30    pub fn add(&mut self, value: u64) {
31        let hash = splitmix64(value);
32        let m = self.registers.len();
33        let idx = (hash as usize) & (m - 1);
34        let remaining = hash >> self.precision;
35        let leading_zeros = if remaining == 0 {
36            (64 - self.precision) + 1
37        } else {
38            (remaining.leading_zeros() as u8).saturating_sub(self.precision) + 1
39        };
40        if leading_zeros > self.registers[idx] {
41            self.registers[idx] = leading_zeros;
42        }
43    }
44
45    pub fn add_batch(&mut self, values: &[u64]) {
46        for &v in values {
47            self.add(v);
48        }
49    }
50
51    pub fn add_f64_batch(&mut self, values: &[f64]) {
52        for &v in values {
53            self.add(v.to_bits());
54        }
55    }
56
57    pub fn estimate(&self) -> f64 {
58        let m = self.registers.len() as f64;
59        let alpha = match self.registers.len() {
60            16 => 0.673,
61            32 => 0.697,
62            64 => 0.709,
63            _ => 0.7213 / (1.0 + 1.079 / m),
64        };
65
66        let sum: f64 = self
67            .registers
68            .iter()
69            .map(|&r| 2.0f64.powi(-(r as i32)))
70            .sum();
71        let raw_estimate = alpha * m * m / sum;
72
73        if raw_estimate <= 2.5 * m {
74            let zeros = self.registers.iter().filter(|&&r| r == 0).count() as f64;
75            if zeros > 0.0 {
76                return m * (m / zeros).ln();
77            }
78        }
79
80        raw_estimate
81    }
82
83    pub fn merge(&mut self, other: &HyperLogLog) {
84        for (i, &r) in other.registers.iter().enumerate() {
85            if i < self.registers.len() && r > self.registers[i] {
86                self.registers[i] = r;
87            }
88        }
89    }
90
91    pub fn memory_bytes(&self) -> usize {
92        self.registers.len()
93    }
94
95    /// Access raw registers for serialization.
96    pub fn registers(&self) -> &[u8] {
97        &self.registers
98    }
99
100    /// Reconstruct from serialized registers (assumes precision 14).
101    pub fn from_registers(data: &[u8]) -> Self {
102        let precision = match data.len() {
103            16 => 4,
104            32 => 5,
105            64 => 6,
106            128 => 7,
107            256 => 8,
108            512 => 9,
109            1024 => 10,
110            2048 => 11,
111            4096 => 12,
112            8192 => 13,
113            16384 => 14,
114            32768 => 15,
115            65536 => 16,
116            131072 => 17,
117            262144 => 18,
118            _ => 14,
119        };
120        Self {
121            registers: data.to_vec(),
122            precision,
123        }
124    }
125}
126
127impl Default for HyperLogLog {
128    fn default() -> Self {
129        Self::new()
130    }
131}
132
133/// Splitmix64 hash — excellent avalanche for sequential integers.
134fn splitmix64(mut x: u64) -> u64 {
135    x = x.wrapping_add(0x9e3779b97f4a7c15);
136    x = (x ^ (x >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
137    x = (x ^ (x >> 27)).wrapping_mul(0x94d049bb133111eb);
138    x ^ (x >> 31)
139}
140
141#[cfg(test)]
142mod tests {
143    use super::*;
144
145    #[test]
146    fn hll_empty() {
147        let hll = HyperLogLog::new();
148        assert!(hll.estimate() < 1.0);
149    }
150
151    #[test]
152    fn hll_small_cardinality() {
153        let mut hll = HyperLogLog::new();
154        for i in 0..100u64 {
155            hll.add(i);
156        }
157        let est = hll.estimate();
158        assert!((90.0..110.0).contains(&est), "expected ~100, got {est:.0}");
159    }
160
161    #[test]
162    fn hll_large_cardinality() {
163        let mut hll = HyperLogLog::new();
164        for i in 0..100_000u64 {
165            hll.add(i);
166        }
167        let est = hll.estimate();
168        let error = (est - 100_000.0).abs() / 100_000.0;
169        assert!(error < 0.05, "expected <5% error, got {error:.3}");
170    }
171
172    #[test]
173    fn hll_duplicates_ignored() {
174        let mut hll = HyperLogLog::new();
175        for _ in 0..10_000 {
176            hll.add(42);
177        }
178        let est = hll.estimate();
179        assert!(est < 5.0, "expected ~1, got {est:.0}");
180    }
181
182    #[test]
183    fn hll_merge() {
184        let mut a = HyperLogLog::new();
185        let mut b = HyperLogLog::new();
186        for i in 0..5000u64 {
187            a.add(i);
188        }
189        for i in 3000..8000u64 {
190            b.add(i);
191        }
192        a.merge(&b);
193        let est = a.estimate();
194        let error = (est - 8000.0).abs() / 8000.0;
195        assert!(error < 0.05, "expected ~8000, got {est:.0}");
196    }
197
198    #[test]
199    fn hll_f64_batch() {
200        let mut hll = HyperLogLog::new();
201        let values: Vec<f64> = (0..1000).map(|i| i as f64 * 0.1).collect();
202        hll.add_f64_batch(&values);
203        let est = hll.estimate();
204        assert!(
205            (900.0..1100.0).contains(&est),
206            "expected ~1000, got {est:.0}"
207        );
208    }
209
210    #[test]
211    fn hll_memory() {
212        let hll = HyperLogLog::new();
213        assert_eq!(hll.memory_bytes(), 16384);
214    }
215
216    #[test]
217    fn hll_serde_roundtrip_merge_semantics() {
218        let mut a = HyperLogLog::new();
219        let mut b = HyperLogLog::new();
220        for i in 0..1000u64 {
221            a.add(i);
222        }
223        for i in 500..1500u64 {
224            b.add(i);
225        }
226
227        // Serialize and deserialize a via serde_json (serde derive, not zerompk).
228        let json = serde_json::to_vec(&a).expect("serialize HLL");
229        let mut a_prime: HyperLogLog = serde_json::from_slice(&json).expect("deserialize HLL");
230
231        // Merge b into deserialized a'.
232        a_prime.merge(&b);
233
234        // Merge b into original a.
235        a.merge(&b);
236
237        // Both should estimate ~1500 (the union cardinality).
238        let est_orig = a.estimate();
239        let est_rt = a_prime.estimate();
240        // Allow 1% relative difference between the two paths (they are identical state).
241        let diff = (est_orig - est_rt).abs() / est_orig.max(1.0);
242        assert!(
243            diff < 0.01,
244            "roundtrip diverged: orig={est_orig:.0}, roundtrip={est_rt:.0}"
245        );
246    }
247}