hashiverse_lib/tools/
hyper_log_log.rs1use crate::tools::hashing;
13
14const NUM_REGISTERS_POWER: u8 = 6;
19const NUM_REGISTERS: usize = 1 << NUM_REGISTERS_POWER; const ALPHA: f64 = 0.709;
23
24#[derive(Clone, Debug)]
25pub struct HyperLogLog {
26 registers: [u8; NUM_REGISTERS],
27}
28
29impl Default for HyperLogLog {
30 fn default() -> Self {
31 Self::new()
32 }
33}
34
35impl HyperLogLog {
36 pub fn new() -> Self {
37 Self { registers: [0; NUM_REGISTERS] }
38 }
39
40 pub fn insert(&mut self, data: &[u8]) {
41 let hash = hashing::hash(data);
42 let hash_bytes = hash.0;
43
44 let register_index = (hash_bytes[0] as usize) & (NUM_REGISTERS - 1);
46
47 let remaining_value = u64::from_be_bytes([
50 hash_bytes[1], hash_bytes[2], hash_bytes[3], hash_bytes[4],
51 hash_bytes[5], hash_bytes[6], hash_bytes[7], hash_bytes[8],
52 ]);
53
54 let leading_zeros_plus_one = (remaining_value.leading_zeros() + 1) as u8;
56
57 if leading_zeros_plus_one > self.registers[register_index] {
58 self.registers[register_index] = leading_zeros_plus_one;
59 }
60 }
61
62 pub fn count(&self) -> u64 {
63 let num_registers_f64 = NUM_REGISTERS as f64;
64
65 let harmonic_sum: f64 = self.registers.iter().map(|®ister_value| {
67 2.0_f64.powi(-(register_value as i32))
68 }).sum();
69
70 let raw_estimate = ALPHA * num_registers_f64 * num_registers_f64 / harmonic_sum;
71
72 if raw_estimate <= 2.5 * num_registers_f64 {
74 let num_zero_registers = self.registers.iter().filter(|&&v| v == 0).count();
75 if num_zero_registers > 0 {
76 let linear_count = num_registers_f64 * (num_registers_f64 / num_zero_registers as f64).ln();
78 return linear_count as u64;
79 }
80 }
81
82 raw_estimate as u64
83 }
84
85 pub fn merge(&mut self, other: &HyperLogLog) {
86 for i in 0..NUM_REGISTERS {
87 if other.registers[i] > self.registers[i] {
88 self.registers[i] = other.registers[i];
89 }
90 }
91 }
92}
93
94#[cfg(test)]
95mod tests {
96 use super::*;
97
98 #[test]
99 fn test_hyperloglog_empty() {
100 let hll = HyperLogLog::new();
101 assert_eq!(hll.count(), 0);
102 }
103
104 #[test]
105 fn test_hyperloglog_single_insert() {
106 let mut hll = HyperLogLog::new();
107 hll.insert(b"hello");
108 assert!(hll.count() >= 1);
109 }
110
111 #[test]
112 fn test_hyperloglog_duplicate_inserts() {
113 let mut hll = HyperLogLog::new();
114 for _ in 0..1000 {
115 hll.insert(b"same_value");
116 }
117 assert_eq!(hll.count(), 1);
119 }
120
121 #[test]
122 fn test_hyperloglog_distinct_values() {
123 let mut hll = HyperLogLog::new();
124 let num_distinct = 1000;
125 for i in 0..num_distinct {
126 hll.insert(format!("value_{}", i).as_bytes());
127 }
128 let estimated_count = hll.count();
129 let lower_bound = (num_distinct as f64 * 0.75) as u64;
131 let upper_bound = (num_distinct as f64 * 1.25) as u64;
132 assert!(
133 estimated_count >= lower_bound && estimated_count <= upper_bound,
134 "Expected count near {}, got {} (bounds: {}-{})", num_distinct, estimated_count, lower_bound, upper_bound
135 );
136 }
137
138 #[test]
139 fn test_hyperloglog_merge() {
140 let mut hll_a = HyperLogLog::new();
141 let mut hll_b = HyperLogLog::new();
142
143 for i in 0..500 {
144 hll_a.insert(format!("a_{}", i).as_bytes());
145 }
146 for i in 0..500 {
147 hll_b.insert(format!("b_{}", i).as_bytes());
148 }
149
150 let count_a = hll_a.count();
151 let count_b = hll_b.count();
152
153 hll_a.merge(&hll_b);
154 let merged_count = hll_a.count();
155
156 assert!(merged_count > count_a);
158 assert!(merged_count > count_b);
159 let expected = 1000;
160 let lower_bound = (expected as f64 * 0.75) as u64;
161 let upper_bound = (expected as f64 * 1.25) as u64;
162 assert!(
163 merged_count >= lower_bound && merged_count <= upper_bound,
164 "Expected merged count near {}, got {} (bounds: {}-{})", expected, merged_count, lower_bound, upper_bound
165 );
166 }
167
168 #[test]
169 fn test_hyperloglog_merge_with_overlap() {
170 let mut hll_a = HyperLogLog::new();
171 let mut hll_b = HyperLogLog::new();
172
173 for i in 0..500 {
175 hll_a.insert(format!("shared_{}", i).as_bytes());
176 hll_b.insert(format!("shared_{}", i).as_bytes());
177 }
178
179 hll_a.merge(&hll_b);
180 let merged_count = hll_a.count();
181
182 let lower_bound = (500_f64 * 0.75) as u64;
184 let upper_bound = (500_f64 * 1.25) as u64;
185 assert!(
186 merged_count >= lower_bound && merged_count <= upper_bound,
187 "Expected merged count near 500, got {} (bounds: {}-{})", merged_count, lower_bound, upper_bound
188 );
189 }
190
191 #[test]
192 fn test_hyperloglog_small_counts() {
193 for expected in [2, 5, 10, 20, 50] {
194 let mut hll = HyperLogLog::new();
195 for i in 0..expected {
196 hll.insert(format!("item_{}", i).as_bytes());
197 }
198 let estimated_count = hll.count();
199 let lower_bound = (expected as f64 * 0.5) as u64;
201 let upper_bound = (expected as f64 * 2.0) as u64;
202 assert!(
203 estimated_count >= lower_bound && estimated_count <= upper_bound,
204 "For {} items: expected count near {}, got {} (bounds: {}-{})", expected, expected, estimated_count, lower_bound, upper_bound
205 );
206 }
207 }
208}