1use std::collections::BTreeMap;
38use std::hash::{Hash, Hasher};
39use twox_hash::XxHash64;
40
41const SPARSE_THRESHOLD_DIVISOR: usize = 4;
44
45#[derive(Debug, Clone)]
47enum HllRepresentation {
48 Sparse(BTreeMap<u16, u8>),
51 Dense(Vec<u8>),
54}
55
56#[derive(Debug, Clone)]
66pub struct HyperLogLog {
67 precision: u8,
69 num_registers: usize,
71 representation: HllRepresentation,
73}
74
75impl HyperLogLog {
76 pub fn new(precision: u8) -> Self {
86 assert!((4..=18).contains(&precision), "Precision must be 4-18");
87 let num_registers = 1 << precision;
88 Self {
89 precision,
90 num_registers,
91 representation: HllRepresentation::Sparse(BTreeMap::new()),
92 }
93 }
94
95 pub fn new_dense(precision: u8) -> Self {
98 assert!((4..=18).contains(&precision), "Precision must be 4-18");
99 let num_registers = 1 << precision;
100 Self {
101 precision,
102 num_registers,
103 representation: HllRepresentation::Dense(vec![0; num_registers]),
104 }
105 }
106
107 pub fn default_precision() -> Self {
109 Self::new(14)
110 }
111
112 #[inline]
114 pub fn is_sparse(&self) -> bool {
115 matches!(self.representation, HllRepresentation::Sparse(_))
116 }
117
118 #[inline]
120 fn sparse_threshold(&self) -> usize {
121 self.num_registers / SPARSE_THRESHOLD_DIVISOR
122 }
123
124 fn convert_to_dense(&mut self) {
126 if let HllRepresentation::Sparse(ref sparse_map) = self.representation {
127 let mut registers = vec![0u8; self.num_registers];
128 for (&idx, &rho) in sparse_map.iter() {
129 registers[idx as usize] = rho;
130 }
131 self.representation = HllRepresentation::Dense(registers);
132 }
133 }
134
135 #[inline]
137 fn update_register(&mut self, register_idx: usize, rho: u8) {
138 match &mut self.representation {
139 HllRepresentation::Sparse(sparse_map) => {
140 let idx = register_idx as u16;
141 let current = sparse_map.get(&idx).copied().unwrap_or(0);
142 if rho > current {
143 sparse_map.insert(idx, rho);
144 }
145 if sparse_map.len() > self.sparse_threshold() {
147 self.convert_to_dense();
148 }
149 }
150 HllRepresentation::Dense(registers) => {
151 registers[register_idx] = registers[register_idx].max(rho);
152 }
153 }
154 }
155
156 #[inline]
158 #[allow(dead_code)]
159 fn get_register(&self, register_idx: usize) -> u8 {
160 match &self.representation {
161 HllRepresentation::Sparse(sparse_map) => {
162 sparse_map.get(&(register_idx as u16)).copied().unwrap_or(0)
163 }
164 HllRepresentation::Dense(registers) => registers[register_idx],
165 }
166 }
167
168 #[inline]
173 fn hash<T: Hash>(&self, item: &T) -> u64 {
174 let mut hasher = XxHash64::default();
175 item.hash(&mut hasher);
176 hasher.finish()
177 }
178
179 #[inline]
181 pub fn add<T: Hash>(&mut self, item: &T) {
182 let hash = self.hash(item);
183
184 let register_idx = (hash >> (64 - self.precision)) as usize;
186
187 let remaining = hash << self.precision;
189 let rho = if remaining == 0 {
190 64 - self.precision + 1
191 } else {
192 remaining.leading_zeros() as u8 + 1
193 };
194
195 self.update_register(register_idx, rho);
197 }
198
199 #[inline]
201 pub fn add_hash(&mut self, hash: u64) {
202 let register_idx = (hash >> (64 - self.precision)) as usize;
203 let remaining = hash << self.precision;
204 let rho = if remaining == 0 {
205 64 - self.precision + 1
206 } else {
207 remaining.leading_zeros() as u8 + 1
208 };
209 self.update_register(register_idx, rho);
210 }
211
212 pub fn cardinality(&self) -> u64 {
217 let m = self.num_registers as f64;
218
219 let alpha_m = match self.precision {
221 4 => 0.673,
222 5 => 0.697,
223 6 => 0.709,
224 _ => 0.7213 / (1.0 + 1.079 / m),
225 };
226
227 let (sum, zeros) = self.compute_harmonic_sum_and_zeros();
229
230 let raw_estimate = alpha_m * m * m / sum;
231
232 let estimate = if raw_estimate <= 5.0 * m {
235 let bias = self.estimate_bias(raw_estimate);
237 let corrected = raw_estimate - bias;
238
239 if zeros > 0.0 {
241 let linear_estimate = m * (m / zeros).ln();
242 if linear_estimate <= Self::linear_counting_threshold(self.precision) {
244 return linear_estimate as u64;
245 }
246 }
247 corrected
248 } else {
249 raw_estimate
250 };
251
252 if estimate > (1u64 << 32) as f64 / 30.0 {
254 let two_to_32 = (1u64 << 32) as f64;
255 return (-two_to_32 * (1.0 - estimate / two_to_32).ln()) as u64;
256 }
257
258 estimate.max(0.0) as u64
259 }
260
261 #[inline]
263 fn compute_harmonic_sum_and_zeros(&self) -> (f64, f64) {
264 match &self.representation {
265 HllRepresentation::Sparse(sparse_map) => {
266 let mut sum = 0.0_f64;
267 for &rho in sparse_map.values() {
268 sum += 2.0_f64.powi(-(rho as i32));
269 }
270 let zeros = (self.num_registers - sparse_map.len()) as f64;
272 sum += zeros; (sum, zeros)
274 }
275 HllRepresentation::Dense(registers) => {
276 let mut sum = 0.0_f64;
277 let mut zeros = 0.0_f64;
278 for &r in registers.iter() {
279 sum += 2.0_f64.powi(-(r as i32));
280 if r == 0 {
281 zeros += 1.0;
282 }
283 }
284 (sum, zeros)
285 }
286 }
287 }
288
289 #[inline]
292 fn estimate_bias(&self, raw_estimate: f64) -> f64 {
293 let m = self.num_registers as f64;
296 if raw_estimate < 0.5 * m {
297 0.7 * m * (0.5 * m / raw_estimate).min(1.0)
298 } else if raw_estimate < 2.5 * m {
299 0.2 * m * (2.5 * m - raw_estimate) / (2.0 * m)
300 } else {
301 0.0
302 }
303 }
304
305 #[inline]
308 fn linear_counting_threshold(precision: u8) -> f64 {
309 match precision {
311 4 => 10.0,
312 5 => 20.0,
313 6 => 40.0,
314 7 => 80.0,
315 8 => 220.0,
316 9 => 400.0,
317 10 => 900.0,
318 11 => 1800.0,
319 12 => 3100.0,
320 13 => 6500.0,
321 14 => 11500.0,
322 15 => 20000.0,
323 16 => 50000.0,
324 17 => 120000.0,
325 18 => 350000.0,
326 _ => 11500.0, }
328 }
329
330 pub fn merge(&mut self, other: &HyperLogLog) {
335 assert_eq!(self.precision, other.precision, "Precision mismatch");
336
337 match (&mut self.representation, &other.representation) {
338 (HllRepresentation::Sparse(self_map), HllRepresentation::Sparse(other_map)) => {
340 for (&idx, &rho) in other_map.iter() {
341 let current = self_map.get(&idx).copied().unwrap_or(0);
342 if rho > current {
343 self_map.insert(idx, rho);
344 }
345 }
346 if self_map.len() > self.num_registers / SPARSE_THRESHOLD_DIVISOR {
348 self.convert_to_dense();
349 }
350 }
351 (HllRepresentation::Dense(self_regs), HllRepresentation::Dense(other_regs)) => {
353 for (i, &r) in other_regs.iter().enumerate() {
354 self_regs[i] = self_regs[i].max(r);
355 }
356 }
357 (HllRepresentation::Sparse(_), HllRepresentation::Dense(other_regs)) => {
359 self.convert_to_dense();
360 if let HllRepresentation::Dense(self_regs) = &mut self.representation {
361 for (i, &r) in other_regs.iter().enumerate() {
362 self_regs[i] = self_regs[i].max(r);
363 }
364 }
365 }
366 (HllRepresentation::Dense(self_regs), HllRepresentation::Sparse(other_map)) => {
368 for (&idx, &rho) in other_map.iter() {
369 let i = idx as usize;
370 self_regs[i] = self_regs[i].max(rho);
371 }
372 }
373 }
374 }
375
376 pub fn is_empty(&self) -> bool {
378 match &self.representation {
379 HllRepresentation::Sparse(sparse_map) => sparse_map.is_empty(),
380 HllRepresentation::Dense(registers) => registers.iter().all(|&r| r == 0),
381 }
382 }
383
384 pub fn clear(&mut self) {
386 self.representation = HllRepresentation::Sparse(BTreeMap::new());
387 }
388
389 pub fn memory_usage(&self) -> usize {
391 std::mem::size_of::<Self>()
392 + match &self.representation {
393 HllRepresentation::Sparse(sparse_map) => {
394 sparse_map.len() * (std::mem::size_of::<u16>() + std::mem::size_of::<u8>())
396 }
397 HllRepresentation::Dense(registers) => registers.len(),
398 }
399 }
400
401 pub fn precision(&self) -> u8 {
403 self.precision
404 }
405
406 pub fn standard_error(&self) -> f64 {
408 1.04 / (self.num_registers as f64).sqrt() * 100.0
409 }
410
411 pub fn non_zero_count(&self) -> usize {
413 match &self.representation {
414 HllRepresentation::Sparse(sparse_map) => sparse_map.len(),
415 HllRepresentation::Dense(registers) => registers.iter().filter(|&&r| r != 0).count(),
416 }
417 }
418}
419
420impl Default for HyperLogLog {
421 fn default() -> Self {
422 Self::default_precision()
423 }
424}
425
426#[cfg(test)]
427mod tests {
428 use super::*;
429
430 #[test]
431 fn test_basic_cardinality() {
432 let mut hll = HyperLogLog::new(14);
433
434 for i in 0..1000u64 {
436 hll.add(&i);
437 }
438
439 let estimate = hll.cardinality();
440 let error = (estimate as f64 - 1000.0).abs() / 1000.0;
441
442 assert!(error < 0.05, "Error was {}%", error * 100.0);
444 }
445
446 #[test]
447 fn test_duplicates() {
448 let mut hll = HyperLogLog::new(14);
449
450 for _ in 0..1000 {
452 hll.add(&42u64);
453 }
454
455 let estimate = hll.cardinality();
457 assert!(estimate <= 2, "Estimate was {}", estimate);
458 }
459
460 #[test]
461 fn test_merge() {
462 let mut hll1 = HyperLogLog::new(14);
463 let mut hll2 = HyperLogLog::new(14);
464
465 for i in 0..500u64 {
467 hll1.add(&i);
468 }
469 for i in 500..1000u64 {
470 hll2.add(&i);
471 }
472
473 hll1.merge(&hll2);
474
475 let estimate = hll1.cardinality();
476 let error = (estimate as f64 - 1000.0).abs() / 1000.0;
477
478 assert!(error < 0.05, "Error was {}%", error * 100.0);
479 }
480
481 #[test]
482 fn test_string_items() {
483 let mut hll = HyperLogLog::new(14);
484
485 for i in 0..1000 {
486 hll.add(&format!("session_{}", i));
487 }
488
489 let estimate = hll.cardinality();
490 let error = (estimate as f64 - 1000.0).abs() / 1000.0;
491
492 assert!(error < 0.05, "Error was {}%", error * 100.0);
493 }
494
495 #[test]
498 fn test_sparse_mode_small_cardinality() {
499 let mut hll = HyperLogLog::new(14);
500
501 for i in 0..100u64 {
503 hll.add(&i);
504 }
505
506 assert!(
507 hll.is_sparse(),
508 "Should remain in sparse mode for 100 items"
509 );
510
511 let memory = hll.memory_usage();
513 assert!(
514 memory < 1000,
515 "Sparse mode memory should be < 1KB, was {}",
516 memory
517 );
518
519 let estimate = hll.cardinality();
521 let error = (estimate as f64 - 100.0).abs() / 100.0;
522 assert!(error < 0.10, "Error was {}%", error * 100.0);
523 }
524
525 #[test]
526 fn test_auto_conversion_to_dense() {
527 let mut hll = HyperLogLog::new(10); assert!(hll.is_sparse(), "Should start in sparse mode");
530
531 for i in 0..400u64 {
534 hll.add(&i);
535 }
536
537 assert!(
538 !hll.is_sparse(),
539 "Should convert to dense mode after threshold"
540 );
541 }
542
543 #[test]
544 fn test_new_dense_constructor() {
545 let hll = HyperLogLog::new_dense(14);
546 assert!(!hll.is_sparse(), "new_dense should create dense HLL");
547 }
548
549 #[test]
550 fn test_merge_sparse_sparse() {
551 let mut hll1 = HyperLogLog::new(14);
552 let mut hll2 = HyperLogLog::new(14);
553
554 for i in 0..50u64 {
555 hll1.add(&i);
556 }
557 for i in 50..100u64 {
558 hll2.add(&i);
559 }
560
561 assert!(hll1.is_sparse());
562 assert!(hll2.is_sparse());
563
564 hll1.merge(&hll2);
565
566 assert!(hll1.is_sparse());
568
569 let estimate = hll1.cardinality();
570 let error = (estimate as f64 - 100.0).abs() / 100.0;
571 assert!(error < 0.15, "Error was {}%", error * 100.0);
572 }
573
574 #[test]
575 fn test_merge_dense_sparse() {
576 let mut hll_dense = HyperLogLog::new_dense(14);
577 let mut hll_sparse = HyperLogLog::new(14);
578
579 for i in 0..500u64 {
580 hll_dense.add(&i);
581 }
582 for i in 500..600u64 {
583 hll_sparse.add(&i);
584 }
585
586 assert!(!hll_dense.is_sparse());
587 assert!(hll_sparse.is_sparse());
588
589 hll_dense.merge(&hll_sparse);
590
591 let estimate = hll_dense.cardinality();
592 let error = (estimate as f64 - 600.0).abs() / 600.0;
593 assert!(error < 0.05, "Error was {}%", error * 100.0);
594 }
595
596 #[test]
597 fn test_clear_resets_to_sparse() {
598 let mut hll = HyperLogLog::new_dense(14);
599
600 for i in 0..1000u64 {
601 hll.add(&i);
602 }
603
604 assert!(!hll.is_sparse());
605
606 hll.clear();
607
608 assert!(hll.is_sparse(), "Clear should reset to sparse mode");
609 assert!(hll.is_empty());
610 }
611}