1use std::collections::BTreeMap;
35use std::hash::{Hash, Hasher};
36use twox_hash::XxHash64;
37
38const SPARSE_THRESHOLD_DIVISOR: usize = 4;
41
42#[derive(Debug, Clone)]
44enum HllRepresentation {
45 Sparse(BTreeMap<u16, u8>),
48 Dense(Vec<u8>),
51}
52
53#[derive(Debug, Clone)]
63pub struct HyperLogLog {
64 precision: u8,
66 num_registers: usize,
68 representation: HllRepresentation,
70}
71
72impl HyperLogLog {
73 pub fn new(precision: u8) -> Self {
83 assert!((4..=18).contains(&precision), "Precision must be 4-18");
84 let num_registers = 1 << precision;
85 Self {
86 precision,
87 num_registers,
88 representation: HllRepresentation::Sparse(BTreeMap::new()),
89 }
90 }
91
92 pub fn new_dense(precision: u8) -> Self {
95 assert!((4..=18).contains(&precision), "Precision must be 4-18");
96 let num_registers = 1 << precision;
97 Self {
98 precision,
99 num_registers,
100 representation: HllRepresentation::Dense(vec![0; num_registers]),
101 }
102 }
103
104 pub fn default_precision() -> Self {
106 Self::new(14)
107 }
108
109 #[inline]
111 pub fn is_sparse(&self) -> bool {
112 matches!(self.representation, HllRepresentation::Sparse(_))
113 }
114
115 #[inline]
117 fn sparse_threshold(&self) -> usize {
118 self.num_registers / SPARSE_THRESHOLD_DIVISOR
119 }
120
121 fn convert_to_dense(&mut self) {
123 if let HllRepresentation::Sparse(ref sparse_map) = self.representation {
124 let mut registers = vec![0u8; self.num_registers];
125 for (&idx, &rho) in sparse_map.iter() {
126 registers[idx as usize] = rho;
127 }
128 self.representation = HllRepresentation::Dense(registers);
129 }
130 }
131
132 #[inline]
134 fn update_register(&mut self, register_idx: usize, rho: u8) {
135 match &mut self.representation {
136 HllRepresentation::Sparse(sparse_map) => {
137 let idx = register_idx as u16;
138 let current = sparse_map.get(&idx).copied().unwrap_or(0);
139 if rho > current {
140 sparse_map.insert(idx, rho);
141 }
142 if sparse_map.len() > self.sparse_threshold() {
144 self.convert_to_dense();
145 }
146 }
147 HllRepresentation::Dense(registers) => {
148 registers[register_idx] = registers[register_idx].max(rho);
149 }
150 }
151 }
152
153 #[inline]
155 #[allow(dead_code)]
156 fn get_register(&self, register_idx: usize) -> u8 {
157 match &self.representation {
158 HllRepresentation::Sparse(sparse_map) => {
159 sparse_map.get(&(register_idx as u16)).copied().unwrap_or(0)
160 }
161 HllRepresentation::Dense(registers) => registers[register_idx],
162 }
163 }
164
165 #[inline]
170 fn hash<T: Hash>(&self, item: &T) -> u64 {
171 let mut hasher = XxHash64::default();
172 item.hash(&mut hasher);
173 hasher.finish()
174 }
175
176 #[inline]
178 pub fn add<T: Hash>(&mut self, item: &T) {
179 let hash = self.hash(item);
180
181 let register_idx = (hash >> (64 - self.precision)) as usize;
183
184 let remaining = hash << self.precision;
186 let rho = if remaining == 0 {
187 64 - self.precision + 1
188 } else {
189 remaining.leading_zeros() as u8 + 1
190 };
191
192 self.update_register(register_idx, rho);
194 }
195
196 #[inline]
198 pub fn add_hash(&mut self, hash: u64) {
199 let register_idx = (hash >> (64 - self.precision)) as usize;
200 let remaining = hash << self.precision;
201 let rho = if remaining == 0 {
202 64 - self.precision + 1
203 } else {
204 remaining.leading_zeros() as u8 + 1
205 };
206 self.update_register(register_idx, rho);
207 }
208
209 pub fn cardinality(&self) -> u64 {
214 let m = self.num_registers as f64;
215
216 let alpha_m = match self.precision {
218 4 => 0.673,
219 5 => 0.697,
220 6 => 0.709,
221 _ => 0.7213 / (1.0 + 1.079 / m),
222 };
223
224 let (sum, zeros) = self.compute_harmonic_sum_and_zeros();
226
227 let raw_estimate = alpha_m * m * m / sum;
228
229 let estimate = if raw_estimate <= 5.0 * m {
232 let bias = self.estimate_bias(raw_estimate);
234 let corrected = raw_estimate - bias;
235
236 if zeros > 0.0 {
238 let linear_estimate = m * (m / zeros).ln();
239 if linear_estimate <= Self::linear_counting_threshold(self.precision) {
241 return linear_estimate as u64;
242 }
243 }
244 corrected
245 } else {
246 raw_estimate
247 };
248
249 if estimate > (1u64 << 32) as f64 / 30.0 {
251 let two_to_32 = (1u64 << 32) as f64;
252 return (-two_to_32 * (1.0 - estimate / two_to_32).ln()) as u64;
253 }
254
255 estimate.max(0.0) as u64
256 }
257
258 #[inline]
260 fn compute_harmonic_sum_and_zeros(&self) -> (f64, f64) {
261 match &self.representation {
262 HllRepresentation::Sparse(sparse_map) => {
263 let mut sum = 0.0_f64;
264 for &rho in sparse_map.values() {
265 sum += 2.0_f64.powi(-(rho as i32));
266 }
267 let zeros = (self.num_registers - sparse_map.len()) as f64;
269 sum += zeros; (sum, zeros)
271 }
272 HllRepresentation::Dense(registers) => {
273 let mut sum = 0.0_f64;
274 let mut zeros = 0.0_f64;
275 for &r in registers.iter() {
276 sum += 2.0_f64.powi(-(r as i32));
277 if r == 0 {
278 zeros += 1.0;
279 }
280 }
281 (sum, zeros)
282 }
283 }
284 }
285
286 #[inline]
289 fn estimate_bias(&self, raw_estimate: f64) -> f64 {
290 let m = self.num_registers as f64;
293 if raw_estimate < 0.5 * m {
294 0.7 * m * (0.5 * m / raw_estimate).min(1.0)
295 } else if raw_estimate < 2.5 * m {
296 0.2 * m * (2.5 * m - raw_estimate) / (2.0 * m)
297 } else {
298 0.0
299 }
300 }
301
302 #[inline]
305 fn linear_counting_threshold(precision: u8) -> f64 {
306 match precision {
308 4 => 10.0,
309 5 => 20.0,
310 6 => 40.0,
311 7 => 80.0,
312 8 => 220.0,
313 9 => 400.0,
314 10 => 900.0,
315 11 => 1800.0,
316 12 => 3100.0,
317 13 => 6500.0,
318 14 => 11500.0,
319 15 => 20000.0,
320 16 => 50000.0,
321 17 => 120000.0,
322 18 => 350000.0,
323 _ => 11500.0, }
325 }
326
327 pub fn merge(&mut self, other: &HyperLogLog) {
332 assert_eq!(self.precision, other.precision, "Precision mismatch");
333
334 match (&mut self.representation, &other.representation) {
335 (HllRepresentation::Sparse(self_map), HllRepresentation::Sparse(other_map)) => {
337 for (&idx, &rho) in other_map.iter() {
338 let current = self_map.get(&idx).copied().unwrap_or(0);
339 if rho > current {
340 self_map.insert(idx, rho);
341 }
342 }
343 if self_map.len() > self.num_registers / SPARSE_THRESHOLD_DIVISOR {
345 self.convert_to_dense();
346 }
347 }
348 (HllRepresentation::Dense(self_regs), HllRepresentation::Dense(other_regs)) => {
350 for (i, &r) in other_regs.iter().enumerate() {
351 self_regs[i] = self_regs[i].max(r);
352 }
353 }
354 (HllRepresentation::Sparse(_), HllRepresentation::Dense(other_regs)) => {
356 self.convert_to_dense();
357 if let HllRepresentation::Dense(self_regs) = &mut self.representation {
358 for (i, &r) in other_regs.iter().enumerate() {
359 self_regs[i] = self_regs[i].max(r);
360 }
361 }
362 }
363 (HllRepresentation::Dense(self_regs), HllRepresentation::Sparse(other_map)) => {
365 for (&idx, &rho) in other_map.iter() {
366 let i = idx as usize;
367 self_regs[i] = self_regs[i].max(rho);
368 }
369 }
370 }
371 }
372
373 pub fn is_empty(&self) -> bool {
375 match &self.representation {
376 HllRepresentation::Sparse(sparse_map) => sparse_map.is_empty(),
377 HllRepresentation::Dense(registers) => registers.iter().all(|&r| r == 0),
378 }
379 }
380
381 pub fn clear(&mut self) {
383 self.representation = HllRepresentation::Sparse(BTreeMap::new());
384 }
385
386 pub fn memory_usage(&self) -> usize {
388 std::mem::size_of::<Self>()
389 + match &self.representation {
390 HllRepresentation::Sparse(sparse_map) => {
391 sparse_map.len() * (std::mem::size_of::<u16>() + std::mem::size_of::<u8>())
393 }
394 HllRepresentation::Dense(registers) => registers.len(),
395 }
396 }
397
398 pub fn precision(&self) -> u8 {
400 self.precision
401 }
402
403 pub fn standard_error(&self) -> f64 {
405 1.04 / (self.num_registers as f64).sqrt() * 100.0
406 }
407
408 pub fn non_zero_count(&self) -> usize {
410 match &self.representation {
411 HllRepresentation::Sparse(sparse_map) => sparse_map.len(),
412 HllRepresentation::Dense(registers) => registers.iter().filter(|&&r| r != 0).count(),
413 }
414 }
415}
416
417impl Default for HyperLogLog {
418 fn default() -> Self {
419 Self::default_precision()
420 }
421}
422
423#[cfg(test)]
424mod tests {
425 use super::*;
426
427 #[test]
428 fn test_basic_cardinality() {
429 let mut hll = HyperLogLog::new(14);
430
431 for i in 0..1000u64 {
433 hll.add(&i);
434 }
435
436 let estimate = hll.cardinality();
437 let error = (estimate as f64 - 1000.0).abs() / 1000.0;
438
439 assert!(error < 0.05, "Error was {}%", error * 100.0);
441 }
442
443 #[test]
444 fn test_duplicates() {
445 let mut hll = HyperLogLog::new(14);
446
447 for _ in 0..1000 {
449 hll.add(&42u64);
450 }
451
452 let estimate = hll.cardinality();
454 assert!(estimate <= 2, "Estimate was {}", estimate);
455 }
456
457 #[test]
458 fn test_merge() {
459 let mut hll1 = HyperLogLog::new(14);
460 let mut hll2 = HyperLogLog::new(14);
461
462 for i in 0..500u64 {
464 hll1.add(&i);
465 }
466 for i in 500..1000u64 {
467 hll2.add(&i);
468 }
469
470 hll1.merge(&hll2);
471
472 let estimate = hll1.cardinality();
473 let error = (estimate as f64 - 1000.0).abs() / 1000.0;
474
475 assert!(error < 0.05, "Error was {}%", error * 100.0);
476 }
477
478 #[test]
479 fn test_string_items() {
480 let mut hll = HyperLogLog::new(14);
481
482 for i in 0..1000 {
483 hll.add(&format!("session_{}", i));
484 }
485
486 let estimate = hll.cardinality();
487 let error = (estimate as f64 - 1000.0).abs() / 1000.0;
488
489 assert!(error < 0.05, "Error was {}%", error * 100.0);
490 }
491
492 #[test]
495 fn test_sparse_mode_small_cardinality() {
496 let mut hll = HyperLogLog::new(14);
497
498 for i in 0..100u64 {
500 hll.add(&i);
501 }
502
503 assert!(
504 hll.is_sparse(),
505 "Should remain in sparse mode for 100 items"
506 );
507
508 let memory = hll.memory_usage();
510 assert!(
511 memory < 1000,
512 "Sparse mode memory should be < 1KB, was {}",
513 memory
514 );
515
516 let estimate = hll.cardinality();
518 let error = (estimate as f64 - 100.0).abs() / 100.0;
519 assert!(error < 0.10, "Error was {}%", error * 100.0);
520 }
521
522 #[test]
523 fn test_auto_conversion_to_dense() {
524 let mut hll = HyperLogLog::new(10); assert!(hll.is_sparse(), "Should start in sparse mode");
527
528 for i in 0..400u64 {
531 hll.add(&i);
532 }
533
534 assert!(
535 !hll.is_sparse(),
536 "Should convert to dense mode after threshold"
537 );
538 }
539
540 #[test]
541 fn test_new_dense_constructor() {
542 let hll = HyperLogLog::new_dense(14);
543 assert!(!hll.is_sparse(), "new_dense should create dense HLL");
544 }
545
546 #[test]
547 fn test_merge_sparse_sparse() {
548 let mut hll1 = HyperLogLog::new(14);
549 let mut hll2 = HyperLogLog::new(14);
550
551 for i in 0..50u64 {
552 hll1.add(&i);
553 }
554 for i in 50..100u64 {
555 hll2.add(&i);
556 }
557
558 assert!(hll1.is_sparse());
559 assert!(hll2.is_sparse());
560
561 hll1.merge(&hll2);
562
563 assert!(hll1.is_sparse());
565
566 let estimate = hll1.cardinality();
567 let error = (estimate as f64 - 100.0).abs() / 100.0;
568 assert!(error < 0.15, "Error was {}%", error * 100.0);
569 }
570
571 #[test]
572 fn test_merge_dense_sparse() {
573 let mut hll_dense = HyperLogLog::new_dense(14);
574 let mut hll_sparse = HyperLogLog::new(14);
575
576 for i in 0..500u64 {
577 hll_dense.add(&i);
578 }
579 for i in 500..600u64 {
580 hll_sparse.add(&i);
581 }
582
583 assert!(!hll_dense.is_sparse());
584 assert!(hll_sparse.is_sparse());
585
586 hll_dense.merge(&hll_sparse);
587
588 let estimate = hll_dense.cardinality();
589 let error = (estimate as f64 - 600.0).abs() / 600.0;
590 assert!(error < 0.05, "Error was {}%", error * 100.0);
591 }
592
593 #[test]
594 fn test_clear_resets_to_sparse() {
595 let mut hll = HyperLogLog::new_dense(14);
596
597 for i in 0..1000u64 {
598 hll.add(&i);
599 }
600
601 assert!(!hll.is_sparse());
602
603 hll.clear();
604
605 assert!(hll.is_sparse(), "Clear should reset to sparse mode");
606 assert!(hll.is_empty());
607 }
608}