Skip to main content

oxihuman_core/
hyperloglog.rs

1// Copyright (C) 2026 COOLJAPAN OU (Team KitaSan)
2// SPDX-License-Identifier: Apache-2.0
3
4//! HyperLogLog cardinality estimator.
5
6#![allow(dead_code)]
7
8/// HyperLogLog with configurable precision (b bits for register count).
9#[allow(dead_code)]
10pub struct HyperLogLog {
11    registers: Vec<u8>,
12    b: usize,
13    m: usize,
14}
15
16fn fnv1a(data: &[u8]) -> u64 {
17    const FNV_PRIME: u64 = 0x00000100_000001B3;
18    const FNV_OFFSET: u64 = 0xcbf29ce4_84222325;
19    let mut h = FNV_OFFSET;
20    for &byte in data {
21        h ^= byte as u64;
22        h = h.wrapping_mul(FNV_PRIME);
23    }
24    h
25}
26
27fn leading_zeros_plus_one(x: u64, bits: usize) -> u8 {
28    let shifted = x << bits;
29    if shifted == 0 {
30        (64 - bits + 1) as u8
31    } else {
32        shifted.leading_zeros() as u8 + 1
33    }
34}
35
36/// Alpha correction factor for HyperLogLog.
37fn alpha(m: usize) -> f64 {
38    match m {
39        16 => 0.673,
40        32 => 0.697,
41        64 => 0.709,
42        _ => 0.7213 / (1.0 + 1.079 / m as f64),
43    }
44}
45
46impl HyperLogLog {
47    /// Create with precision b (4..=18). m = 2^b registers.
48    #[allow(dead_code)]
49    pub fn new(b: usize) -> Self {
50        let b = b.clamp(4, 18);
51        let m = 1usize << b;
52        Self {
53            registers: vec![0u8; m],
54            b,
55            m,
56        }
57    }
58
59    #[allow(dead_code)]
60    pub fn add(&mut self, data: &[u8]) {
61        let hash = fnv1a(data);
62        let index = (hash >> (64 - self.b)) as usize;
63        let w = hash << self.b | ((1u64 << self.b) - 1);
64        let rho = leading_zeros_plus_one(w, self.b);
65        if rho > self.registers[index] {
66            self.registers[index] = rho;
67        }
68    }
69
70    /// Estimate the number of distinct elements.
71    #[allow(dead_code)]
72    pub fn count(&self) -> u64 {
73        let m = self.m as f64;
74        let alpha = alpha(self.m);
75        let sum: f64 = self
76            .registers
77            .iter()
78            .map(|&r| 2.0_f64.powi(-(r as i32)))
79            .sum();
80        let estimate = alpha * m * m / sum;
81
82        let zero_count = self.registers.iter().filter(|&&r| r == 0).count();
83        if estimate < 2.5 * m && zero_count > 0 {
84            (m * (m / zero_count as f64).ln()) as u64
85        } else {
86            estimate as u64
87        }
88    }
89
90    #[allow(dead_code)]
91    pub fn merge(&mut self, other: &HyperLogLog) {
92        if self.m != other.m {
93            return;
94        }
95        for (a, &b) in self.registers.iter_mut().zip(other.registers.iter()) {
96            if b > *a {
97                *a = b;
98            }
99        }
100    }
101
102    #[allow(dead_code)]
103    pub fn reset(&mut self) {
104        for r in &mut self.registers {
105            *r = 0;
106        }
107    }
108
109    #[allow(dead_code)]
110    pub fn precision(&self) -> usize {
111        self.b
112    }
113
114    #[allow(dead_code)]
115    pub fn num_registers(&self) -> usize {
116        self.m
117    }
118}
119
120#[cfg(test)]
121mod tests {
122    use super::*;
123
124    #[test]
125    fn test_empty_count_zero() {
126        let hll = HyperLogLog::new(10);
127        assert_eq!(hll.count(), 0);
128    }
129
130    #[test]
131    fn test_single_element() {
132        let mut hll = HyperLogLog::new(10);
133        hll.add(b"hello");
134        assert!(hll.count() >= 1);
135    }
136
137    #[test]
138    fn test_distinct_estimate_approximate() {
139        let mut hll = HyperLogLog::new(12);
140        for i in 0u32..1000 {
141            hll.add(&i.to_le_bytes());
142        }
143        let est = hll.count();
144        assert!(est > 800 && est < 1200, "estimate={est}");
145    }
146
147    #[test]
148    fn test_duplicates_not_overcounted() {
149        let mut hll = HyperLogLog::new(12);
150        for _ in 0..100 {
151            hll.add(b"same");
152        }
153        let est = hll.count();
154        assert!(est < 10, "estimate={est}");
155    }
156
157    #[test]
158    fn test_merge() {
159        let mut hll1 = HyperLogLog::new(10);
160        let mut hll2 = HyperLogLog::new(10);
161        for i in 0u32..100 {
162            hll1.add(&i.to_le_bytes());
163        }
164        for i in 100u32..200 {
165            hll2.add(&i.to_le_bytes());
166        }
167        hll1.merge(&hll2);
168        let est = hll1.count();
169        /* HyperLogLog is probabilistic; allow ~30% error around 200 */
170        assert!((100..=300).contains(&est), "estimate={est}");
171    }
172
173    #[test]
174    fn test_reset() {
175        let mut hll = HyperLogLog::new(10);
176        hll.add(b"x");
177        hll.reset();
178        assert_eq!(hll.count(), 0);
179    }
180
181    #[test]
182    fn test_precision_clamp() {
183        let hll = HyperLogLog::new(2);
184        assert_eq!(hll.precision(), 4);
185        let hll2 = HyperLogLog::new(20);
186        assert_eq!(hll2.precision(), 18);
187    }
188
189    #[test]
190    fn test_num_registers() {
191        let hll = HyperLogLog::new(8);
192        assert_eq!(hll.num_registers(), 256);
193    }
194
195    #[test]
196    fn test_merge_same_data() {
197        let mut hll1 = HyperLogLog::new(10);
198        let mut hll2 = HyperLogLog::new(10);
199        hll1.add(b"a");
200        hll2.add(b"a");
201        hll1.merge(&hll2);
202        let est = hll1.count();
203        assert!(est < 5, "estimate={est}");
204    }
205
206    #[test]
207    fn test_add_empty() {
208        let mut hll = HyperLogLog::new(10);
209        hll.add(b"");
210        assert!(hll.count() >= 1);
211    }
212}