Skip to main content

hyperbitbit/
lib.rs

1//! A native rust implementation of a HyperBitBit algorithm introduced by
2//! by Robert Sedgewick in [AC11-Cardinality.pdf](https://www.cs.princeton.edu/~rs/talks/AC11-Cardinality.pdf)
3//!
4//! # Feature
5//! * HyperBitBit, for N < 2^64
6//! * Uses 128 + 8 bit of space
7//! * Estimated cardinality withing 10% or actuals for large N.
8//!
9//! Consider HyperLogLog variants for productions usage, sine this data structure
10//! extensively studied, merge able and more accurate. HyperBitBit is extremely
11//! cheap and fast alternative in expense of accuracy.
12//!
13//! # Usage
14//!
15//! This crate is [on crates.io](https://crates.io/crates/hyperbitbit) and
16//! can be used by adding `hyperbitbit` to the dependencies in your
17//! project's `Cargo.toml`.
18//!
19//! ```toml
20//! [dependencies]
21//! hyperbitbit = "0.0.1-alpha.1"
22//! ```
23//! If you want [serde](https://github.com/serde-rs/serde) support, include the feature like this:
24//!
25//! ```toml
26//! [dependencies]
27//! hyperbitbit = { version = "0.0.1-alpha.1", features = ["serde_support"] }
28//! ```
29//!
30//! add this to your crate root:
31//!
32//! ```rust
33//! extern crate hyperbitbit;
34//! ```
35//!
36//! # Example
37//!
38//! Create a HyperBitBit, add more data and estimate cardinality
39//!
40//! ```rust
41//! use hyperbitbit::HyperBitBit;
42//! use rand::distributions::Alphanumeric;
43//! use rand::prelude::*;
44//! use std::collections::HashSet;
45//!
46//! fn main() {
47//!     // create hbb for cardinality estimation
48//!     let mut hbb = HyperBitBit::new();
49//!     // hash set to measure exact cardinality
50//!     let mut items = HashSet::new();
51//!     // fixed seed for RNG for reproducibly results
52//!     let mut rng = StdRng::seed_from_u64(42);
53//!     // put bunch of random strings on hbb and hash set for comparison
54//!     let maxn = 10000;
55//!     for _ in 1..maxn {
56//!         let s = (&mut rng).sample_iter(&Alphanumeric).take(4).collect::<String>();
57//!
58//!         hbb.insert(&s);
59//!         items.insert(s);
60//!     }
61//!     let expected: i64 = items.len() as i64;
62//!     let rel: f64 = (100.0 * (expected - hbb.cardinality() as i64) as f64) / (expected as f64);
63//!
64//!     println!("Actuals cardinality:   {:?}", expected);
65//!     println!("Estimated cardinality: {:?}", hbb.cardinality());
66//!     println!("Error % cardinality:   {:.2}", rel);
67//! }
68//!```
69//! # Lincese
70//!  Licensed under the Apache License, Version 2.0
71
72
73use std::collections::hash_map::DefaultHasher;
74use std::hash::{Hash, Hasher};
75
76#[cfg(feature="serde_support")]
77extern crate serde;
78
79#[cfg(feature="serde_support")]
80use serde::{Serialize, Deserialize};
81
82#[derive(Clone, Copy, Debug)]
83#[cfg_attr(feature = "serde_support", derive(Serialize, Deserialize))]
84pub struct HyperBitBit {
85    lgn: u8,
86    sketch1: u64,
87    sketch2: u64,
88}
89
90impl Default for HyperBitBit {
91    fn default() -> HyperBitBit {
92        HyperBitBit {
93            lgn: 5,
94            sketch1: 0,
95            sketch2: 0,
96        }
97    }
98}
99
100impl HyperBitBit {
101    /// create a new HyperBitBit struct
102    ///
103    /// # Example
104    /// ```
105    /// # use hyperbitbit::HyperBitBit;
106    /// let mut h = HyperBitBit::new();
107    /// ```
108    pub fn new() -> HyperBitBit {
109        Default::default()
110    }
111
112    /// estimate cardinality
113    ///
114    /// # Example
115    /// ```
116    /// # use hyperbitbit::HyperBitBit;
117    /// let mut h = HyperBitBit::new();
118    /// h.insert(&String::from("xxx"));
119    /// println!("{}", h.cardinality());
120    /// ```
121    pub fn cardinality(&self) -> u64 {
122        let exponent: f64 = self.lgn as f64 + 5.4 + (self.sketch1.count_ones() as f64) / 32.0;
123        f64::powf(2.0, exponent) as u64
124    }
125
126    /// add string to HyperBitBit
127    ///
128    /// # Example
129    /// ```
130    /// # use hyperbitbit::HyperBitBit;
131    /// let mut h = HyperBitBit::new();
132    /// h.insert(&String::from("xxx"));
133    /// ```
134    pub fn insert(&mut self, v: &str) {
135        let mut hasher = DefaultHasher::new();
136        v.hash(&mut hasher);
137        let hash_val: u64 = hasher.finish();
138
139        let k: u64 = (hash_val << 58) >> 58;
140        let r: u64 = ((hash_val >> 6).leading_zeros() - 6).into();
141
142        if r > self.lgn.into() {
143            self.sketch1 |= 1_u64 << k
144        }
145
146        if r > (self.lgn + 1).into() {
147            self.sketch2 |= 1_u64 << k
148        }
149        if self.sketch1.count_ones() > 31 {
150            self.sketch1 = self.sketch2;
151            self.sketch2 = 0;
152            self.lgn += 1;
153        }
154    }
155}
156
157#[cfg(test)]
158mod tests {
159    extern crate rand;
160    extern crate serde_json;
161
162    use rand::distributions::Alphanumeric;
163    use rand::prelude::*;
164    use std::collections::HashSet;
165    use super::HyperBitBit;
166
167    #[test]
168    fn test_basic() {
169        let mut h = HyperBitBit::new();
170        // HyperBitBit is not working for small cardinalities
171        assert_eq!(1351, h.cardinality());
172        h.insert(&String::from("xxx"));
173        h.insert(&String::from("yyy"));
174        assert_eq!(1351, h.cardinality());
175    }
176
177    #[test]
178    fn test_cardinality() {
179        let mut h = HyperBitBit::new();
180        let mut items = HashSet::new();
181
182        assert_eq!(1351, h.cardinality());
183
184        let mut rng = StdRng::seed_from_u64(42);
185        let maxn = 10000;
186        for _ in 1..=maxn {
187            let s = (&mut rng).sample_iter(&Alphanumeric).take(4).collect::<String>();
188
189            h.insert(&s);
190            items.insert(s);
191        }
192        let expected: i64 = items.len() as i64;
193        let rel: f64 = (100.0 * (expected - h.cardinality() as i64) as f64) / (expected as f64);
194        assert!(rel < 10.0);
195    }
196
197    #[cfg(feature = "serde")]
198    #[test]
199    fn test_serde() {
200        let mut h = HyperBitBit::new();
201        h.insert(&String::from("xxx"));
202
203        let serialized_h = serde_json::to_string(&h).unwrap();
204        let other_h: HyperBitBit = serde_json::from_str(&serialized_h).unwrap();
205
206        assert_eq!(h.cardinality(), other_h.cardinality());
207        assert_eq!(h.sketch1, other_h.sketch1);
208        assert_eq!(h.sketch2, other_h.sketch2);
209        assert_eq!(h.lgn, other_h.lgn);
210    }
211}