oxihuman_core/
hyperloglog_v2.rs1#![allow(dead_code)]
4
5const HLL_SEED: u64 = 0x9e37_79b9_7f4a_7c15;
8const ALPHA_16: f64 = 0.673;
9const ALPHA_32: f64 = 0.697;
10const ALPHA_64: f64 = 0.709;
11
12pub struct HyperLogLogV2 {
14 registers: Vec<u8>,
15 b: u8,
16 m: usize,
17}
18
19impl HyperLogLogV2 {
20 pub fn new(b: u8) -> Self {
22 let b = b.clamp(4, 16);
23 let m = 1usize << b;
24 HyperLogLogV2 {
25 registers: vec![0u8; m],
26 b,
27 m,
28 }
29 }
30
31 fn hash_item(&self, item: u64) -> u64 {
32 item.wrapping_mul(HLL_SEED)
33 .rotate_left(27)
34 .wrapping_add(HLL_SEED >> 17)
35 }
36
37 fn leading_zeros_plus_one(val: u64, bits: u8) -> u8 {
38 let shifted = val << bits;
39 shifted.leading_zeros() as u8 + 1
40 }
41
42 pub fn add(&mut self, item: u64) {
44 let h = self.hash_item(item);
45 let reg_idx = (h >> (64 - self.b)) as usize;
46 let w = h << self.b;
47 let rho = w.leading_zeros() as u8 + 1;
48 if rho > self.registers[reg_idx] {
49 self.registers[reg_idx] = rho;
50 }
51 }
52
53 pub fn count(&self) -> u64 {
55 let m = self.m as f64;
56 let alpha = match self.m {
57 16 => ALPHA_16,
58 32 => ALPHA_32,
59 64 => ALPHA_64,
60 _ => 0.7213 / (1.0 + 1.079 / m),
61 };
62 let z: f64 = self
63 .registers
64 .iter()
65 .map(|&r| 2.0f64.powi(-(r as i32)))
66 .sum();
67 let raw = alpha * m * m / z;
68 let zeros = self.registers.iter().filter(|&&r| r == 0).count();
70 if raw < 2.5 * m && zeros > 0 {
71 (m * (m / zeros as f64).ln()).round() as u64
72 } else {
73 raw.round() as u64
74 }
75 }
76
77 pub fn precision(&self) -> u8 {
79 self.b
80 }
81
82 pub fn num_registers(&self) -> usize {
84 self.m
85 }
86
87 pub fn merge(&mut self, other: &HyperLogLogV2) {
89 if other.b == self.b {
90 for (r, &o) in self.registers.iter_mut().zip(other.registers.iter()) {
91 if o > *r {
92 *r = o;
93 }
94 }
95 }
96 }
97}
98
99#[cfg(test)]
100mod tests {
101 use super::*;
102
103 #[test]
104 fn test_empty_cardinality() {
105 let hll = HyperLogLogV2::new(8);
106 assert_eq!(hll.count(), 0 );
107 }
108
109 #[test]
110 fn test_single_element() {
111 let mut hll = HyperLogLogV2::new(8);
112 hll.add(1234);
113 assert!(hll.count() >= 1 );
114 }
115
116 #[test]
117 fn test_many_elements_approximate() {
118 let mut hll = HyperLogLogV2::new(12);
119 for i in 0u64..1000 {
120 hll.add(i);
121 }
122 let est = hll.count();
123 assert!((800..=1200).contains(&est));
125 }
126
127 #[test]
128 fn test_precision_clamp() {
129 let hll = HyperLogLogV2::new(2);
130 assert_eq!(hll.precision(), 4 );
131 }
132
133 #[test]
134 fn test_num_registers() {
135 let hll = HyperLogLogV2::new(8);
136 assert_eq!(hll.num_registers(), 256 );
137 }
138
139 #[test]
140 fn test_merge_same_precision() {
141 let mut a = HyperLogLogV2::new(8);
142 let mut b = HyperLogLogV2::new(8);
143 for i in 0u64..500 {
144 a.add(i);
145 }
146 for i in 500u64..1000 {
147 b.add(i);
148 }
149 a.merge(&b);
150 let est = a.count();
151 assert!(est >= 700 );
152 }
153
154 #[test]
155 fn test_merge_different_precision_ignored() {
156 let mut a = HyperLogLogV2::new(8);
157 let b = HyperLogLogV2::new(10);
158 let before = a.count();
160 a.merge(&b);
161 assert_eq!(a.count(), before);
162 }
163
164 #[test]
165 fn test_high_precision() {
166 let hll = HyperLogLogV2::new(16);
167 assert_eq!(hll.num_registers(), 65536 );
168 }
169
170 #[test]
171 fn test_duplicate_inserts() {
172 let mut hll = HyperLogLogV2::new(8);
173 for _ in 0..100 {
174 hll.add(42 );
175 }
176 let est = hll.count();
177 assert!(est <= 10);
179 }
180}