nodedb_types/approx/
hll.rs1#[derive(Debug, serde::Serialize, serde::Deserialize)]
11pub struct HyperLogLog {
12 registers: Vec<u8>,
13 precision: u8,
14}
15
16impl HyperLogLog {
17 pub fn new() -> Self {
18 Self::with_precision(14)
19 }
20
21 pub fn with_precision(p: u8) -> Self {
22 let p = p.clamp(4, 18);
23 let m = 1usize << p;
24 Self {
25 registers: vec![0u8; m],
26 precision: p,
27 }
28 }
29
30 pub fn add(&mut self, value: u64) {
31 let hash = splitmix64(value);
32 let m = self.registers.len();
33 let idx = (hash as usize) & (m - 1);
34 let remaining = hash >> self.precision;
35 let leading_zeros = if remaining == 0 {
36 (64 - self.precision) + 1
37 } else {
38 (remaining.leading_zeros() as u8).saturating_sub(self.precision) + 1
39 };
40 if leading_zeros > self.registers[idx] {
41 self.registers[idx] = leading_zeros;
42 }
43 }
44
45 pub fn add_batch(&mut self, values: &[u64]) {
46 for &v in values {
47 self.add(v);
48 }
49 }
50
51 pub fn add_f64_batch(&mut self, values: &[f64]) {
52 for &v in values {
53 self.add(v.to_bits());
54 }
55 }
56
57 pub fn estimate(&self) -> f64 {
58 let m = self.registers.len() as f64;
59 let alpha = match self.registers.len() {
60 16 => 0.673,
61 32 => 0.697,
62 64 => 0.709,
63 _ => 0.7213 / (1.0 + 1.079 / m),
64 };
65
66 let sum: f64 = self
67 .registers
68 .iter()
69 .map(|&r| 2.0f64.powi(-(r as i32)))
70 .sum();
71 let raw_estimate = alpha * m * m / sum;
72
73 if raw_estimate <= 2.5 * m {
74 let zeros = self.registers.iter().filter(|&&r| r == 0).count() as f64;
75 if zeros > 0.0 {
76 return m * (m / zeros).ln();
77 }
78 }
79
80 raw_estimate
81 }
82
83 pub fn merge(&mut self, other: &HyperLogLog) {
84 for (i, &r) in other.registers.iter().enumerate() {
85 if i < self.registers.len() && r > self.registers[i] {
86 self.registers[i] = r;
87 }
88 }
89 }
90
91 pub fn memory_bytes(&self) -> usize {
92 self.registers.len()
93 }
94
95 pub fn registers(&self) -> &[u8] {
97 &self.registers
98 }
99
100 pub fn from_registers(data: &[u8]) -> Self {
102 let precision = match data.len() {
103 16 => 4,
104 32 => 5,
105 64 => 6,
106 128 => 7,
107 256 => 8,
108 512 => 9,
109 1024 => 10,
110 2048 => 11,
111 4096 => 12,
112 8192 => 13,
113 16384 => 14,
114 32768 => 15,
115 65536 => 16,
116 131072 => 17,
117 262144 => 18,
118 _ => 14,
119 };
120 Self {
121 registers: data.to_vec(),
122 precision,
123 }
124 }
125}
126
127impl Default for HyperLogLog {
128 fn default() -> Self {
129 Self::new()
130 }
131}
132
133fn splitmix64(mut x: u64) -> u64 {
135 x = x.wrapping_add(0x9e3779b97f4a7c15);
136 x = (x ^ (x >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
137 x = (x ^ (x >> 27)).wrapping_mul(0x94d049bb133111eb);
138 x ^ (x >> 31)
139}
140
141#[cfg(test)]
142mod tests {
143 use super::*;
144
145 #[test]
146 fn hll_empty() {
147 let hll = HyperLogLog::new();
148 assert!(hll.estimate() < 1.0);
149 }
150
151 #[test]
152 fn hll_small_cardinality() {
153 let mut hll = HyperLogLog::new();
154 for i in 0..100u64 {
155 hll.add(i);
156 }
157 let est = hll.estimate();
158 assert!((90.0..110.0).contains(&est), "expected ~100, got {est:.0}");
159 }
160
161 #[test]
162 fn hll_large_cardinality() {
163 let mut hll = HyperLogLog::new();
164 for i in 0..100_000u64 {
165 hll.add(i);
166 }
167 let est = hll.estimate();
168 let error = (est - 100_000.0).abs() / 100_000.0;
169 assert!(error < 0.05, "expected <5% error, got {error:.3}");
170 }
171
172 #[test]
173 fn hll_duplicates_ignored() {
174 let mut hll = HyperLogLog::new();
175 for _ in 0..10_000 {
176 hll.add(42);
177 }
178 let est = hll.estimate();
179 assert!(est < 5.0, "expected ~1, got {est:.0}");
180 }
181
182 #[test]
183 fn hll_merge() {
184 let mut a = HyperLogLog::new();
185 let mut b = HyperLogLog::new();
186 for i in 0..5000u64 {
187 a.add(i);
188 }
189 for i in 3000..8000u64 {
190 b.add(i);
191 }
192 a.merge(&b);
193 let est = a.estimate();
194 let error = (est - 8000.0).abs() / 8000.0;
195 assert!(error < 0.05, "expected ~8000, got {est:.0}");
196 }
197
198 #[test]
199 fn hll_f64_batch() {
200 let mut hll = HyperLogLog::new();
201 let values: Vec<f64> = (0..1000).map(|i| i as f64 * 0.1).collect();
202 hll.add_f64_batch(&values);
203 let est = hll.estimate();
204 assert!(
205 (900.0..1100.0).contains(&est),
206 "expected ~1000, got {est:.0}"
207 );
208 }
209
210 #[test]
211 fn hll_memory() {
212 let hll = HyperLogLog::new();
213 assert_eq!(hll.memory_bytes(), 16384);
214 }
215
216 #[test]
217 fn hll_serde_roundtrip_merge_semantics() {
218 let mut a = HyperLogLog::new();
219 let mut b = HyperLogLog::new();
220 for i in 0..1000u64 {
221 a.add(i);
222 }
223 for i in 500..1500u64 {
224 b.add(i);
225 }
226
227 let json = serde_json::to_vec(&a).expect("serialize HLL");
229 let mut a_prime: HyperLogLog = serde_json::from_slice(&json).expect("deserialize HLL");
230
231 a_prime.merge(&b);
233
234 a.merge(&b);
236
237 let est_orig = a.estimate();
239 let est_rt = a_prime.estimate();
240 let diff = (est_orig - est_rt).abs() / est_orig.max(1.0);
242 assert!(
243 diff < 0.01,
244 "roundtrip diverged: orig={est_orig:.0}, roundtrip={est_rt:.0}"
245 );
246 }
247}