Skip to main content

hashiverse_lib/tools/
hyper_log_log.rs

1//! # Compact probabilistic cardinality estimation
2//!
3//! A HyperLogLog sketch backed by Blake3 hashing. Used where we need an approximate count
4//! of distinct items (for example, unique viewers of a post or unique voters on a piece of
5//! feedback) without storing the items themselves — the sketch is a fixed 64 bytes
6//! regardless of how many distinct inputs have been seen.
7//!
8//! Configured with 2^6 = 64 registers (yielding ~7.8% standard error), which keeps the
9//! sketch small enough to embed freely in RPC responses and on-disk records while still
10//! being accurate enough for UI-grade counts.
11
12use crate::tools::hashing;
13
14/// A HyperLogLog probabilistic cardinality estimator.
15///
16/// Uses 64 registers (2^6), giving ~7.8% standard error.
17/// Backed by Blake3 hashing.
18const NUM_REGISTERS_POWER: u8 = 6;
19const NUM_REGISTERS: usize = 1 << NUM_REGISTERS_POWER; // 64
20
21/// Alpha constant for bias correction with m=64
22const ALPHA: f64 = 0.709;
23
24#[derive(Clone, Debug)]
25pub struct HyperLogLog {
26    registers: [u8; NUM_REGISTERS],
27}
28
29impl Default for HyperLogLog {
30    fn default() -> Self {
31        Self::new()
32    }
33}
34
35impl HyperLogLog {
36    pub fn new() -> Self {
37        Self { registers: [0; NUM_REGISTERS] }
38    }
39
40    pub fn insert(&mut self, data: &[u8]) {
41        let hash = hashing::hash(data);
42        let hash_bytes = hash.0;
43
44        // Use first byte bits for register index (we need 6 bits for 64 registers)
45        let register_index = (hash_bytes[0] as usize) & (NUM_REGISTERS - 1);
46
47        // Count leading zeros in the remaining bits (starting from bit 6 onward).
48        // We reconstruct a u64 from bytes 1..9 for the leading-zero count.
49        let remaining_value = u64::from_be_bytes([
50            hash_bytes[1], hash_bytes[2], hash_bytes[3], hash_bytes[4],
51            hash_bytes[5], hash_bytes[6], hash_bytes[7], hash_bytes[8],
52        ]);
53
54        // leading_zeros + 1 (HLL convention: rho function)
55        let leading_zeros_plus_one = (remaining_value.leading_zeros() + 1) as u8;
56
57        if leading_zeros_plus_one > self.registers[register_index] {
58            self.registers[register_index] = leading_zeros_plus_one;
59        }
60    }
61
62    pub fn count(&self) -> u64 {
63        let num_registers_f64 = NUM_REGISTERS as f64;
64
65        // Harmonic mean of 2^(-register[i])
66        let harmonic_sum: f64 = self.registers.iter().map(|&register_value| {
67            2.0_f64.powi(-(register_value as i32))
68        }).sum();
69
70        let raw_estimate = ALPHA * num_registers_f64 * num_registers_f64 / harmonic_sum;
71
72        // Small range correction: use linear counting if estimate is small and there are zero registers
73        if raw_estimate <= 2.5 * num_registers_f64 {
74            let num_zero_registers = self.registers.iter().filter(|&&v| v == 0).count();
75            if num_zero_registers > 0 {
76                // Linear counting
77                let linear_count = num_registers_f64 * (num_registers_f64 / num_zero_registers as f64).ln();
78                return linear_count as u64;
79            }
80        }
81
82        raw_estimate as u64
83    }
84
85    pub fn merge(&mut self, other: &HyperLogLog) {
86        for i in 0..NUM_REGISTERS {
87            if other.registers[i] > self.registers[i] {
88                self.registers[i] = other.registers[i];
89            }
90        }
91    }
92}
93
94#[cfg(test)]
95mod tests {
96    use super::*;
97
98    #[test]
99    fn test_hyperloglog_empty() {
100        let hll = HyperLogLog::new();
101        assert_eq!(hll.count(), 0);
102    }
103
104    #[test]
105    fn test_hyperloglog_single_insert() {
106        let mut hll = HyperLogLog::new();
107        hll.insert(b"hello");
108        assert!(hll.count() >= 1);
109    }
110
111    #[test]
112    fn test_hyperloglog_duplicate_inserts() {
113        let mut hll = HyperLogLog::new();
114        for _ in 0..1000 {
115            hll.insert(b"same_value");
116        }
117        // Duplicates should not inflate the count
118        assert_eq!(hll.count(), 1);
119    }
120
121    #[test]
122    fn test_hyperloglog_distinct_values() {
123        let mut hll = HyperLogLog::new();
124        let num_distinct = 1000;
125        for i in 0..num_distinct {
126            hll.insert(format!("value_{}", i).as_bytes());
127        }
128        let estimated_count = hll.count();
129        // With 64 registers (~7.8% standard error), allow 25% tolerance for test stability
130        let lower_bound = (num_distinct as f64 * 0.75) as u64;
131        let upper_bound = (num_distinct as f64 * 1.25) as u64;
132        assert!(
133            estimated_count >= lower_bound && estimated_count <= upper_bound,
134            "Expected count near {}, got {} (bounds: {}-{})", num_distinct, estimated_count, lower_bound, upper_bound
135        );
136    }
137
138    #[test]
139    fn test_hyperloglog_merge() {
140        let mut hll_a = HyperLogLog::new();
141        let mut hll_b = HyperLogLog::new();
142
143        for i in 0..500 {
144            hll_a.insert(format!("a_{}", i).as_bytes());
145        }
146        for i in 0..500 {
147            hll_b.insert(format!("b_{}", i).as_bytes());
148        }
149
150        let count_a = hll_a.count();
151        let count_b = hll_b.count();
152
153        hll_a.merge(&hll_b);
154        let merged_count = hll_a.count();
155
156        // Merged count should be roughly count_a + count_b (no overlap)
157        assert!(merged_count > count_a);
158        assert!(merged_count > count_b);
159        let expected = 1000;
160        let lower_bound = (expected as f64 * 0.75) as u64;
161        let upper_bound = (expected as f64 * 1.25) as u64;
162        assert!(
163            merged_count >= lower_bound && merged_count <= upper_bound,
164            "Expected merged count near {}, got {} (bounds: {}-{})", expected, merged_count, lower_bound, upper_bound
165        );
166    }
167
168    #[test]
169    fn test_hyperloglog_merge_with_overlap() {
170        let mut hll_a = HyperLogLog::new();
171        let mut hll_b = HyperLogLog::new();
172
173        // Both see the same 500 items
174        for i in 0..500 {
175            hll_a.insert(format!("shared_{}", i).as_bytes());
176            hll_b.insert(format!("shared_{}", i).as_bytes());
177        }
178
179        hll_a.merge(&hll_b);
180        let merged_count = hll_a.count();
181
182        // Should still be ~500, not ~1000
183        let lower_bound = (500_f64 * 0.75) as u64;
184        let upper_bound = (500_f64 * 1.25) as u64;
185        assert!(
186            merged_count >= lower_bound && merged_count <= upper_bound,
187            "Expected merged count near 500, got {} (bounds: {}-{})", merged_count, lower_bound, upper_bound
188        );
189    }
190
191    #[test]
192    fn test_hyperloglog_small_counts() {
193        for expected in [2, 5, 10, 20, 50] {
194            let mut hll = HyperLogLog::new();
195            for i in 0..expected {
196                hll.insert(format!("item_{}", i).as_bytes());
197            }
198            let estimated_count = hll.count();
199            // Wider tolerance for small counts
200            let lower_bound = (expected as f64 * 0.5) as u64;
201            let upper_bound = (expected as f64 * 2.0) as u64;
202            assert!(
203                estimated_count >= lower_bound && estimated_count <= upper_bound,
204                "For {} items: expected count near {}, got {} (bounds: {}-{})", expected, expected, estimated_count, lower_bound, upper_bound
205            );
206        }
207    }
208}