Skip to main content

rust_hll/
lib.rs

1use dense::DenseRegisters;
2use explicit::ExplicitStorage;
3pub use settings::{Settings, SettingsError};
4use sparse::SparseRegisters;
5use thiserror::Error;
6
7mod dense;
8#[cfg(test)]
9mod dense_test;
10mod explicit;
11#[cfg(test)]
12mod integration_test;
13mod settings;
14mod sparse;
15#[cfg(test)]
16mod sparse_test;
17mod utils;
18
19/// `Register` is an add-on interface to storage that is implemented by the probabalistic types.
20trait Registers {
21    fn log_2m(&self) -> u32;
22    fn pw_max_mask(&self) -> u64;
23    fn m_bits_mask(&self) -> u64;
24
25    /// set_if_greater sets the register value of register reg_num to the provided value if and only if it's greater than
26    /// the current value.
27    fn set_if_greater(&mut self, reg_num: u32, value: u8);
28
29    /// indicator computes the "indicator function" (Z in the HLL paper).  It additionally returns the number of
30    /// registers whose value is zero (V in the paper).  The returned values are used to drive cardinality calculations.
31    ///
32    /// For reference, Z = indicator(2^(-M[j])) for all j from 0 -> num registers where M[j] is the register value.
33    fn indicator(&self) -> (f64, u32);
34
35    /// calculates the register and value to use when calling `set_if_greater`. Returns None if
36    /// value is 0.
37    fn set(&mut self, value: u64) {
38        // following documentation courtesy of the java implementation:
39        //
40        // p(w): position of the least significant set bit (one-indexed)
41        // By contract: p(w) <= 2^(registerValueInBits) - 1 (the max register
42        // value)
43        //
44        // By construction of pw_max_mask,
45        //      lsb(pw_max_mask) = 2^(registerValueInBits) - 2,
46        // thus lsb(any_long | pw_max_mask) <= 2^(registerValueInBits) - 2,
47        // thus 1 + lsb(any_long | pw_max_mask) <= 2^(registerValueInBits) -1.
48        let substream_value = value >> self.log_2m();
49        if substream_value == 0 {
50            // The paper does not cover p(0x0), so the special value 0 is used.
51            // 0 is the original initialization value of the registers, so by
52            // doing this the multiset simply ignores it. This is acceptable
53            // because the probability is 1/(2^(2^registerSizeInBits)).
54            return;
55        }
56
57        // NOTE : trailing zeros == the 0-based index of the least significant 1
58        //        bit.
59        let p_w = (1 + (substream_value | self.pw_max_mask()).trailing_zeros()) as u8;
60        // NOTE:  no +1 as in paper since 0-based indexing
61        let i = value & self.m_bits_mask();
62
63        // this is safe because the m_bits_mask is 1 less bit in length than log_2m bits
64        self.set_if_greater(i as u32, p_w);
65    }
66}
67
68pub trait Storage {
69    fn bytes_size(&self) -> usize;
70    fn to_bytes(&self, buf: &mut [u8]);
71    fn from_bytes(settings: &Settings, buf: &[u8]) -> Self;
72    fn clear(&mut self);
73}
74
75#[derive(Clone, Debug, Error)]
76pub enum HllError {
77    #[error("{0}")]
78    Settings(#[from] SettingsError),
79    #[error("invalid version {0}")]
80    Version(u8),
81}
82
83#[derive(Clone, Debug, PartialEq)]
84pub enum Hll {
85    Empty(Settings),
86    Explicit(ExplicitStorage),
87    Sparse(SparseRegisters),
88    Dense(DenseRegisters),
89}
90
91impl Hll {
92    pub fn new(settings: Settings) -> Self {
93        Hll::Empty(settings)
94    }
95
96    pub fn add_raw(&mut self, value: u64) {
97        if value == 0 {
98            return;
99        }
100
101        if let Hll::Empty(settings) = self {
102            if settings.explicit_threshold() > 0 {
103                *self = Hll::Explicit(ExplicitStorage::with_settings(settings));
104            } else if settings.sparse_threshold.is_some() {
105                let registers = SparseRegisters::with_settings(settings);
106                *self = Hll::Sparse(registers);
107            } else {
108                *self = Hll::Dense(DenseRegisters::with_settings(settings));
109            }
110        }
111
112        match self {
113            Hll::Explicit(explicit_registers) => {
114                explicit_registers.set(value);
115                if explicit_registers.is_full() {
116                    *self = explicit_registers.as_registers();
117                }
118            }
119            Hll::Sparse(sparse_registers) => {
120                sparse_registers.set(value);
121
122                if sparse_registers.is_full() {
123                    *self = Hll::Dense(sparse_registers.to_dense(None));
124                }
125            }
126            Hll::Dense(dense_registers) => {
127                dense_registers.set(value);
128            }
129            _ => {}
130        }
131    }
132
133    pub fn union(&mut self, strict: bool, other: &Self) -> Result<(), HllError> {
134        if strict {
135            self.settings_check(other)?;
136        }
137
138        match self {
139            Hll::Empty(settings) => {
140                *self = match &other {
141                    Hll::Sparse(sparse_registers) => match settings.sparse_threshold {
142                        Some(sparse_threshold) => {
143                            if sparse_threshold < sparse_registers.len() as i32 {
144                                Hll::Dense(sparse_registers.to_dense(Some(settings)))
145                            } else {
146                                Hll::Sparse(sparse_registers.clone())
147                            }
148                        }
149                        None => Hll::Dense(sparse_registers.to_dense(Some(settings))),
150                    },
151                    _ => other.clone(),
152                };
153            }
154            Hll::Explicit(lhs) => match other {
155                Hll::Empty(_settings) => {}
156                Hll::Explicit(rhs) => {
157                    lhs.union_explicit(rhs);
158                }
159                Hll::Sparse(_sparse_registers) => {
160                    let mut new_storage = lhs.as_registers();
161                    new_storage.union(strict, other)?;
162
163                    *self = new_storage;
164                }
165                Hll::Dense(_dense_registers) => {
166                    let mut new_storage = lhs.as_registers();
167                    new_storage.union(strict, other)?;
168
169                    *self = new_storage;
170                }
171            },
172            Hll::Sparse(sparse_registers) => match other {
173                Hll::Empty(_settings) => {}
174                Hll::Explicit(explicit_storage) => {
175                    sparse_registers.union_explicit(explicit_storage);
176                }
177                Hll::Sparse(rhs_sparse_registers) => {
178                    sparse_registers.union_sparse(rhs_sparse_registers);
179                }
180                Hll::Dense(dense_registers) => {
181                    let mut new_storage = sparse_registers.to_dense(None);
182                    new_storage.union_dense(dense_registers);
183
184                    *self = Hll::Dense(new_storage);
185                }
186            },
187            Hll::Dense(dense_registers) => match other {
188                Hll::Empty(_settings) => {}
189                Hll::Explicit(explicit_storage) => {
190                    dense_registers.union_explicit(explicit_storage);
191                }
192                Hll::Sparse(sparse_registers) => {
193                    dense_registers.union_sparse(sparse_registers);
194                }
195                Hll::Dense(rhs_dense_registers) => {
196                    dense_registers.union_dense(rhs_dense_registers);
197                }
198            },
199        }
200
201        if self.is_full() {
202            self.upgrade();
203        }
204
205        Ok(())
206    }
207
208    pub fn cardinality(&self) -> u64 {
209        let (sum, num_of_zeros) = match self {
210            Hll::Empty(_) => return 0,
211            Hll::Explicit(explicit_storage) => return explicit_storage.len(),
212            Hll::Sparse(sparse_registers) => sparse_registers.indicator(),
213            Hll::Dense(dense_registers) => dense_registers.indicator(),
214        };
215
216        let settings = self.settings();
217
218        // apply the estimate and correction to the indicator function
219        let estimator = settings.alpha_msquared / sum;
220
221        if (num_of_zeros != 0) && (estimator < settings.small_estimator_cutoff) {
222            // following documentation courtesy of the java implementation:
223            // The "small range correction" formula from the HyperLogLog
224            // algorithm. Only appropriate if both the estimator is smaller than
225            // (5/2) * m and there are still registers that have the zero value.
226            let num_of_zeros = num_of_zeros as f64;
227            let m: f64 = (1 << settings.log_2m).into();
228            let small_estimator = m * (m / num_of_zeros).ln();
229            return small_estimator.ceil() as u64;
230        }
231
232        if estimator <= settings.large_estimator_cutoff {
233            return estimator.ceil() as u64;
234        }
235
236        // following documentation courtesy of the java implementation:
237        // The "large range correction" formula from the HyperLogLog algorithm,
238        // adapted for 64 bit hashes. Only appropriate for estimators whose
239        // value exceeds the calculated cutoff.
240        let large_estimator =
241            -1.0 * settings.two_to_l * (1.0 - (estimator / settings.two_to_l)).ln();
242        large_estimator.ceil() as u64
243    }
244
245    fn is_full(&self) -> bool {
246        match self {
247            Hll::Empty(_) => false,
248            Hll::Explicit(explicit_storage) => explicit_storage.is_full(),
249            Hll::Sparse(sparse_registers) => sparse_registers.is_full(),
250            Hll::Dense(_) => false,
251        }
252    }
253
254    fn upgrade(&mut self) {
255        match self {
256            Hll::Empty(_) => {}
257            Hll::Explicit(explicit_storage) => {
258                *self = explicit_storage.as_registers();
259            }
260            Hll::Sparse(sparse_registers) => {
261                *self = Hll::Dense(sparse_registers.to_dense(None));
262            }
263            Hll::Dense(_) => {}
264        }
265    }
266
267    pub fn settings_check(&self, other: &Self) -> Result<(), SettingsError> {
268        self.settings().settings_check(other.settings())
269    }
270
271    pub fn settings(&self) -> &Settings {
272        match self {
273            Hll::Empty(settings) => settings,
274            Hll::Explicit(explicit_storage) => &explicit_storage.settings,
275            Hll::Sparse(sparse_registers) => &sparse_registers.settings,
276            Hll::Dense(dense_registers) => &dense_registers.settings,
277        }
278    }
279
280    pub fn clone_with_settings(&self, settings: &Settings) -> Self {
281        match self {
282            Hll::Empty(_) => Hll::Empty(*settings),
283            Hll::Explicit(explicit_storage) => {
284                Hll::Explicit(explicit_storage.clone_with_settings(settings))
285            }
286            Hll::Sparse(sparse_registers) => {
287                Hll::Sparse(sparse_registers.clone_with_settings(settings))
288            }
289            Hll::Dense(dense_registers) => {
290                Hll::Dense(dense_registers.clone_with_settings(settings))
291            }
292        }
293    }
294
295    pub fn type_id(&self) -> u8 {
296        match self {
297            Hll::Empty(_) => 1,
298            Hll::Explicit(_) => 2,
299            Hll::Sparse(_) => 3,
300            Hll::Dense(_) => 4,
301        }
302    }
303
304    pub fn to_bytes(&self) -> Vec<u8> {
305        let (settings, type_id, size) = match self {
306            Hll::Empty(settings) => (settings, 1, 0),
307            Hll::Explicit(explicit_storage) => {
308                (&explicit_storage.settings, 2, explicit_storage.bytes_size())
309            }
310            Hll::Sparse(sparse_registers) => {
311                (&sparse_registers.settings, 3, sparse_registers.bytes_size())
312            }
313            Hll::Dense(dense_registers) => {
314                (&dense_registers.settings, 4, dense_registers.bytes_size())
315            }
316        };
317        let mut res: Vec<u8> = vec![0; 3 + size];
318
319        res[0] = (1 << 4) | type_id;
320        res[1] = (((settings.reg_width - 1) << 5) | settings.log_2m) as u8;
321        res[2] = settings.pack_cutoff_byte();
322
323        match self {
324            Hll::Empty(_settings) => {}
325            Hll::Explicit(explicit_storage) => {
326                explicit_storage.to_bytes(&mut res[3..]);
327            }
328            Hll::Sparse(sparse_registers) => {
329                sparse_registers.to_bytes(&mut res[3..]);
330            }
331            Hll::Dense(dense_registers) => {
332                dense_registers.to_bytes(&mut res[3..]);
333            }
334        }
335
336        res
337    }
338
339    pub fn from_bytes(buf: &[u8]) -> Result<Self, HllError> {
340        let version = buf[0] >> 4;
341        let type_id = buf[0] & 0x0F;
342
343        if version != 1 {
344            return Err(HllError::Version(version));
345        }
346
347        let reg_width = (buf[1] >> 5) + 1;
348        let log_2m = buf[1] & 0x1F;
349        let (sparse_enabled, explicit_threshold) = Settings::unpack_cutoff_byte(buf[2]);
350
351        let settings = Settings::new(
352            log_2m as u32,
353            reg_width as u32,
354            explicit_threshold,
355            sparse_enabled,
356        )?;
357
358        let storage = match type_id {
359            1 => Self::Empty(settings),
360            2 => Self::Explicit(ExplicitStorage::from_bytes(&settings, &buf[3..])),
361            3 => Self::Sparse(SparseRegisters::from_bytes(&settings, &buf[3..])),
362            4 => Self::Dense(DenseRegisters::from_bytes(&settings, &buf[3..])),
363            _ => {
364                return Err(HllError::Version(type_id));
365            }
366        };
367
368        Ok(storage)
369    }
370
371    pub fn clear(&mut self) {
372        match self {
373            Hll::Empty(_) => {}
374            Hll::Explicit(explicit_storage) => explicit_storage.clear(),
375            Hll::Sparse(sparse_registers) => sparse_registers.clear(),
376            Hll::Dense(dense_registers) => dense_registers.clear(),
377        }
378    }
379}
380
381#[cfg(test)]
382mod tests {
383    use super::*;
384
385    #[test]
386    fn test_hll() {
387        // Create settings for the HLL
388        let settings = Settings::new(
389            10,   // log_2m: number of registers will be 2^10
390            4,    // reg_width: 4 bits per register
391            -1,   // explicit_threshold: auto-calculate threshold
392            true, // sparse_enabled: use sparse representation
393        )
394        .unwrap();
395
396        // Create a new HLL with the settings
397        let mut hll = Hll::new(settings);
398
399        // Add elements
400        hll.add_raw(123456789);
401        println!("Cardinality: {}", hll.cardinality()); // prints "1"
402
403        // Create another HLL and add elements
404        let mut hll2 = Hll::new(settings);
405        hll2.add_raw(123456789);
406        hll2.add_raw(987654321);
407
408        // Union HLLs
409        hll2.union(true, &hll).unwrap();
410        println!("Cardinality after union: {}", hll2.cardinality()); // prints "2"
411
412        // Serialize to bytes
413        let bytes = hll2.to_bytes();
414
415        // Deserialize from bytes
416        let hll3 = Hll::from_bytes(&bytes).unwrap();
417        println!("Cardinality after deserialization: {}", hll3.cardinality()); // prints "2"
418    }
419}