oxihuman_core/
hyperloglog.rs1#![allow(dead_code)]
7
8#[allow(dead_code)]
10pub struct HyperLogLog {
11 registers: Vec<u8>,
12 b: usize,
13 m: usize,
14}
15
16fn fnv1a(data: &[u8]) -> u64 {
17 const FNV_PRIME: u64 = 0x00000100_000001B3;
18 const FNV_OFFSET: u64 = 0xcbf29ce4_84222325;
19 let mut h = FNV_OFFSET;
20 for &byte in data {
21 h ^= byte as u64;
22 h = h.wrapping_mul(FNV_PRIME);
23 }
24 h
25}
26
27fn leading_zeros_plus_one(x: u64, bits: usize) -> u8 {
28 let shifted = x << bits;
29 if shifted == 0 {
30 (64 - bits + 1) as u8
31 } else {
32 shifted.leading_zeros() as u8 + 1
33 }
34}
35
36fn alpha(m: usize) -> f64 {
38 match m {
39 16 => 0.673,
40 32 => 0.697,
41 64 => 0.709,
42 _ => 0.7213 / (1.0 + 1.079 / m as f64),
43 }
44}
45
46impl HyperLogLog {
47 #[allow(dead_code)]
49 pub fn new(b: usize) -> Self {
50 let b = b.clamp(4, 18);
51 let m = 1usize << b;
52 Self {
53 registers: vec![0u8; m],
54 b,
55 m,
56 }
57 }
58
59 #[allow(dead_code)]
60 pub fn add(&mut self, data: &[u8]) {
61 let hash = fnv1a(data);
62 let index = (hash >> (64 - self.b)) as usize;
63 let w = hash << self.b | ((1u64 << self.b) - 1);
64 let rho = leading_zeros_plus_one(w, self.b);
65 if rho > self.registers[index] {
66 self.registers[index] = rho;
67 }
68 }
69
70 #[allow(dead_code)]
72 pub fn count(&self) -> u64 {
73 let m = self.m as f64;
74 let alpha = alpha(self.m);
75 let sum: f64 = self
76 .registers
77 .iter()
78 .map(|&r| 2.0_f64.powi(-(r as i32)))
79 .sum();
80 let estimate = alpha * m * m / sum;
81
82 let zero_count = self.registers.iter().filter(|&&r| r == 0).count();
83 if estimate < 2.5 * m && zero_count > 0 {
84 (m * (m / zero_count as f64).ln()) as u64
85 } else {
86 estimate as u64
87 }
88 }
89
90 #[allow(dead_code)]
91 pub fn merge(&mut self, other: &HyperLogLog) {
92 if self.m != other.m {
93 return;
94 }
95 for (a, &b) in self.registers.iter_mut().zip(other.registers.iter()) {
96 if b > *a {
97 *a = b;
98 }
99 }
100 }
101
102 #[allow(dead_code)]
103 pub fn reset(&mut self) {
104 for r in &mut self.registers {
105 *r = 0;
106 }
107 }
108
109 #[allow(dead_code)]
110 pub fn precision(&self) -> usize {
111 self.b
112 }
113
114 #[allow(dead_code)]
115 pub fn num_registers(&self) -> usize {
116 self.m
117 }
118}
119
120#[cfg(test)]
121mod tests {
122 use super::*;
123
124 #[test]
125 fn test_empty_count_zero() {
126 let hll = HyperLogLog::new(10);
127 assert_eq!(hll.count(), 0);
128 }
129
130 #[test]
131 fn test_single_element() {
132 let mut hll = HyperLogLog::new(10);
133 hll.add(b"hello");
134 assert!(hll.count() >= 1);
135 }
136
137 #[test]
138 fn test_distinct_estimate_approximate() {
139 let mut hll = HyperLogLog::new(12);
140 for i in 0u32..1000 {
141 hll.add(&i.to_le_bytes());
142 }
143 let est = hll.count();
144 assert!(est > 800 && est < 1200, "estimate={est}");
145 }
146
147 #[test]
148 fn test_duplicates_not_overcounted() {
149 let mut hll = HyperLogLog::new(12);
150 for _ in 0..100 {
151 hll.add(b"same");
152 }
153 let est = hll.count();
154 assert!(est < 10, "estimate={est}");
155 }
156
157 #[test]
158 fn test_merge() {
159 let mut hll1 = HyperLogLog::new(10);
160 let mut hll2 = HyperLogLog::new(10);
161 for i in 0u32..100 {
162 hll1.add(&i.to_le_bytes());
163 }
164 for i in 100u32..200 {
165 hll2.add(&i.to_le_bytes());
166 }
167 hll1.merge(&hll2);
168 let est = hll1.count();
169 assert!((100..=300).contains(&est), "estimate={est}");
171 }
172
173 #[test]
174 fn test_reset() {
175 let mut hll = HyperLogLog::new(10);
176 hll.add(b"x");
177 hll.reset();
178 assert_eq!(hll.count(), 0);
179 }
180
181 #[test]
182 fn test_precision_clamp() {
183 let hll = HyperLogLog::new(2);
184 assert_eq!(hll.precision(), 4);
185 let hll2 = HyperLogLog::new(20);
186 assert_eq!(hll2.precision(), 18);
187 }
188
189 #[test]
190 fn test_num_registers() {
191 let hll = HyperLogLog::new(8);
192 assert_eq!(hll.num_registers(), 256);
193 }
194
195 #[test]
196 fn test_merge_same_data() {
197 let mut hll1 = HyperLogLog::new(10);
198 let mut hll2 = HyperLogLog::new(10);
199 hll1.add(b"a");
200 hll2.add(b"a");
201 hll1.merge(&hll2);
202 let est = hll1.count();
203 assert!(est < 5, "estimate={est}");
204 }
205
206 #[test]
207 fn test_add_empty() {
208 let mut hll = HyperLogLog::new(10);
209 hll.add(b"");
210 assert!(hll.count() >= 1);
211 }
212}