hyperbitbit/lib.rs
1//! A native rust implementation of a HyperBitBit algorithm introduced by
2//! by Robert Sedgewick in [AC11-Cardinality.pdf](https://www.cs.princeton.edu/~rs/talks/AC11-Cardinality.pdf)
3//!
4//! # Feature
5//! * HyperBitBit, for N < 2^64
6//! * Uses 128 + 8 bit of space
7//! * Estimated cardinality withing 10% or actuals for large N.
8//!
9//! Consider HyperLogLog variants for productions usage, sine this data structure
10//! extensively studied, merge able and more accurate. HyperBitBit is extremely
11//! cheap and fast alternative in expense of accuracy.
12//!
13//! # Usage
14//!
15//! This crate is [on crates.io](https://crates.io/crates/hyperbitbit) and
16//! can be used by adding `hyperbitbit` to the dependencies in your
17//! project's `Cargo.toml`.
18//!
19//! ```toml
20//! [dependencies]
21//! hyperbitbit = "0.0.1-alpha.1"
22//! ```
23//! If you want [serde](https://github.com/serde-rs/serde) support, include the feature like this:
24//!
25//! ```toml
26//! [dependencies]
27//! hyperbitbit = { version = "0.0.1-alpha.1", features = ["serde_support"] }
28//! ```
29//!
30//! add this to your crate root:
31//!
32//! ```rust
33//! extern crate hyperbitbit;
34//! ```
35//!
36//! # Example
37//!
38//! Create a HyperBitBit, add more data and estimate cardinality
39//!
40//! ```rust
41//! use hyperbitbit::HyperBitBit;
42//! use rand::distributions::Alphanumeric;
43//! use rand::prelude::*;
44//! use std::collections::HashSet;
45//!
46//! fn main() {
47//! // create hbb for cardinality estimation
48//! let mut hbb = HyperBitBit::new();
49//! // hash set to measure exact cardinality
50//! let mut items = HashSet::new();
51//! // fixed seed for RNG for reproducibly results
52//! let mut rng = StdRng::seed_from_u64(42);
53//! // put bunch of random strings on hbb and hash set for comparison
54//! let maxn = 10000;
55//! for _ in 1..maxn {
56//! let s = (&mut rng).sample_iter(&Alphanumeric).take(4).collect::<String>();
57//!
58//! hbb.insert(&s);
59//! items.insert(s);
60//! }
61//! let expected: i64 = items.len() as i64;
62//! let rel: f64 = (100.0 * (expected - hbb.cardinality() as i64) as f64) / (expected as f64);
63//!
64//! println!("Actuals cardinality: {:?}", expected);
65//! println!("Estimated cardinality: {:?}", hbb.cardinality());
66//! println!("Error % cardinality: {:.2}", rel);
67//! }
68//!```
69//! # Lincese
70//! Licensed under the Apache License, Version 2.0
71
72
73use std::collections::hash_map::DefaultHasher;
74use std::hash::{Hash, Hasher};
75
76#[cfg(feature="serde_support")]
77extern crate serde;
78
79#[cfg(feature="serde_support")]
80use serde::{Serialize, Deserialize};
81
82#[derive(Clone, Copy, Debug)]
83#[cfg_attr(feature = "serde_support", derive(Serialize, Deserialize))]
84pub struct HyperBitBit {
85 lgn: u8,
86 sketch1: u64,
87 sketch2: u64,
88}
89
90impl Default for HyperBitBit {
91 fn default() -> HyperBitBit {
92 HyperBitBit {
93 lgn: 5,
94 sketch1: 0,
95 sketch2: 0,
96 }
97 }
98}
99
100impl HyperBitBit {
101 /// create a new HyperBitBit struct
102 ///
103 /// # Example
104 /// ```
105 /// # use hyperbitbit::HyperBitBit;
106 /// let mut h = HyperBitBit::new();
107 /// ```
108 pub fn new() -> HyperBitBit {
109 Default::default()
110 }
111
112 /// estimate cardinality
113 ///
114 /// # Example
115 /// ```
116 /// # use hyperbitbit::HyperBitBit;
117 /// let mut h = HyperBitBit::new();
118 /// h.insert(&String::from("xxx"));
119 /// println!("{}", h.cardinality());
120 /// ```
121 pub fn cardinality(&self) -> u64 {
122 let exponent: f64 = self.lgn as f64 + 5.4 + (self.sketch1.count_ones() as f64) / 32.0;
123 f64::powf(2.0, exponent) as u64
124 }
125
126 /// add string to HyperBitBit
127 ///
128 /// # Example
129 /// ```
130 /// # use hyperbitbit::HyperBitBit;
131 /// let mut h = HyperBitBit::new();
132 /// h.insert(&String::from("xxx"));
133 /// ```
134 pub fn insert(&mut self, v: &str) {
135 let mut hasher = DefaultHasher::new();
136 v.hash(&mut hasher);
137 let hash_val: u64 = hasher.finish();
138
139 let k: u64 = (hash_val << 58) >> 58;
140 let r: u64 = ((hash_val >> 6).leading_zeros() - 6).into();
141
142 if r > self.lgn.into() {
143 self.sketch1 |= 1_u64 << k
144 }
145
146 if r > (self.lgn + 1).into() {
147 self.sketch2 |= 1_u64 << k
148 }
149 if self.sketch1.count_ones() > 31 {
150 self.sketch1 = self.sketch2;
151 self.sketch2 = 0;
152 self.lgn += 1;
153 }
154 }
155}
156
157#[cfg(test)]
158mod tests {
159 extern crate rand;
160 extern crate serde_json;
161
162 use rand::distributions::Alphanumeric;
163 use rand::prelude::*;
164 use std::collections::HashSet;
165 use super::HyperBitBit;
166
167 #[test]
168 fn test_basic() {
169 let mut h = HyperBitBit::new();
170 // HyperBitBit is not working for small cardinalities
171 assert_eq!(1351, h.cardinality());
172 h.insert(&String::from("xxx"));
173 h.insert(&String::from("yyy"));
174 assert_eq!(1351, h.cardinality());
175 }
176
177 #[test]
178 fn test_cardinality() {
179 let mut h = HyperBitBit::new();
180 let mut items = HashSet::new();
181
182 assert_eq!(1351, h.cardinality());
183
184 let mut rng = StdRng::seed_from_u64(42);
185 let maxn = 10000;
186 for _ in 1..=maxn {
187 let s = (&mut rng).sample_iter(&Alphanumeric).take(4).collect::<String>();
188
189 h.insert(&s);
190 items.insert(s);
191 }
192 let expected: i64 = items.len() as i64;
193 let rel: f64 = (100.0 * (expected - h.cardinality() as i64) as f64) / (expected as f64);
194 assert!(rel < 10.0);
195 }
196
197 #[cfg(feature = "serde")]
198 #[test]
199 fn test_serde() {
200 let mut h = HyperBitBit::new();
201 h.insert(&String::from("xxx"));
202
203 let serialized_h = serde_json::to_string(&h).unwrap();
204 let other_h: HyperBitBit = serde_json::from_str(&serialized_h).unwrap();
205
206 assert_eq!(h.cardinality(), other_h.cardinality());
207 assert_eq!(h.sketch1, other_h.sketch1);
208 assert_eq!(h.sketch2, other_h.sketch2);
209 assert_eq!(h.lgn, other_h.lgn);
210 }
211}