1use crate::frequency::hll_data;
6use siphasher::sip::SipHasher13;
7use std::cmp::Ordering::{Equal, Greater, Less};
8use std::hash::{Hash, Hasher};
9use std::iter::repeat;
10
11#[cfg_attr(feature = "with_serde", derive(serde::Serialize, serde::Deserialize))]
12#[derive(Clone, Debug)]
13pub struct HyperLogLog {
14 alpha: f64,
15 p: u8,
16 m: usize,
17 M: Vec<u8>,
18 key0: u64,
19 key1: u64,
20}
21
22impl Default for HyperLogLog {
23 fn default() -> Self {
24 Self::new(0.05)
25 }
26}
27
28impl HyperLogLog {
29 pub fn new(error_rate: f64) -> Self {
30 let key0 = rand::random();
31 let key1 = rand::random();
32 Self::new_from_keys(error_rate, key0, key1)
33 }
34
35 pub fn new_from_keys(error_rate: f64, key0: u64, key1: u64) -> Self {
36 assert!(error_rate > 0.0 && error_rate < 1.0);
37 let sr = 1.04 / error_rate;
38 let p = f64::ln(sr * sr).ceil() as u8;
39 assert!(p <= 64);
40 let alpha = Self::get_alpha(p);
41 let m = 1usize << p;
42
43 HyperLogLog {
44 alpha,
45 p,
46 m,
47 M: repeat(0u8).take(m).collect(),
48 key0,
49 key1,
50 }
51 }
52
53 fn hash<V2>(&self, value: &V2) -> u64
54 where
55 V2: Hash,
56 {
57 let mut sip = SipHasher13::new_with_keys(self.key0, self.key1);
58 value.hash(&mut sip);
59 sip.finish()
60 }
61
62 pub fn new_from_template(hll: &HyperLogLog) -> Self {
63 HyperLogLog {
64 alpha: hll.alpha,
65 p: hll.p,
66 m: hll.m,
67 key0: hll.key0,
68 key1: hll.key1,
69 M: repeat(0u8).take(hll.m).collect(),
70 }
71 }
72
73 pub fn insert<V>(&mut self, value: &V)
74 where
75 V: Hash,
76 {
77 let x = self.hash(value);
78 let j = x as usize & (self.m - 1);
79 let w = x >> self.p;
80 let rho = Self::get_rho(w, 64 - self.p);
81 let mjr = &mut self.M[j];
82 if rho > *mjr {
83 *mjr = rho;
84 }
85 }
86
87 pub fn len(&self) -> f64 {
88 let V = Self::vec_count_zero(&self.M);
89 if V > 0 {
90 let H = self.m as f64 * (self.m as f64 / V as f64).ln();
91 if H <= Self::get_threshold(self.p) {
92 H
93 } else {
94 self.ep()
95 }
96 } else {
97 self.ep()
98 }
99 }
100
101 pub fn is_empty(&self) -> bool {
102 self.len() == 0.0
103 }
104
105 pub fn merge(&mut self, src: &HyperLogLog) {
106 assert!(src.p == self.p);
107 assert!(src.m == self.m);
108
109 for i in 0..self.m {
110 let (src_mir, mir) = (src.M[i], &mut self.M[i]);
111 if src_mir > *mir {
112 *mir = src_mir;
113 }
114 }
115 }
116
117 pub fn clear(&mut self) {
118 self.M.iter_mut().all(|x| {
119 *x = 0;
120 true
121 });
122 }
123
124 fn get_threshold(p: u8) -> f64 {
125 hll_data::THRESHOLD_DATA[p as usize]
126 }
127
128 fn get_alpha(p: u8) -> f64 {
129 assert!(p >= 4 && p <= 16);
130 match p {
131 4 => 0.673,
132 5 => 0.697,
133 6 => 0.709,
134 _ => 0.7213 / (1.0 + 1.079 / (1usize << (p as usize)) as f64),
135 }
136 }
137
138 fn bit_length(x: u64) -> u8 {
139 let mut bits: u8 = 0;
140 let mut xm = x;
141 while xm != 0 {
142 bits += 1;
143 xm >>= 1;
144 }
145 bits
146 }
147
148 fn get_rho(w: u64, max_width: u8) -> u8 {
149 let rho = max_width - Self::bit_length(w) + 1;
150 assert!(rho > 0);
151 rho
152 }
153
154 fn vec_count_zero(v: &[u8]) -> usize {
155 bytecount::count(v, 0)
156 }
157
158 fn estimate_bias(E: f64, p: u8) -> f64 {
159 let bias_vector = hll_data::BIAS_DATA[(p - 4) as usize];
160 let nearest_neighbors =
161 Self::get_nearest_neighbors(E, hll_data::RAW_ESTIMATE_DATA[(p - 4) as usize]);
162 let sum = nearest_neighbors
163 .iter()
164 .fold(0.0, |acc, &neighbor| acc + bias_vector[neighbor]);
165 sum / nearest_neighbors.len() as f64
166 }
167
168 fn get_nearest_neighbors(E: f64, estimate_vector: &[f64]) -> Vec<usize> {
169 let ev_len = estimate_vector.len();
170 let mut r: Vec<(f64, usize)> = repeat((0.0f64, 0usize)).take(ev_len).collect();
171 for i in 0..ev_len {
172 let dr = E - estimate_vector[i];
173 r[i] = (dr * dr, i);
174 }
175 r.sort_by(|a, b| {
176 if a < b {
177 Less
178 } else if a > b {
179 Greater
180 } else {
181 Equal
182 }
183 });
184 r.truncate(6);
185 r.iter()
186 .map(|&ez| {
187 let (_, b) = ez;
188 b
189 })
190 .collect()
191 }
192
193 fn ep(&self) -> f64 {
194 let sum = self
195 .M
196 .iter()
197 .fold(0.0, |acc, &x| acc + 2.0f64.powi(-(x as i32)));
198 let E = self.alpha * (self.m * self.m) as f64 / sum;
199 if E <= (5 * self.m) as f64 {
200 E - Self::estimate_bias(E, self.p)
201 } else {
202 E
203 }
204 }
205}
206
207#[test]
208fn hyperloglog_test_simple() {
209 let mut hll = HyperLogLog::new(0.00408);
210 let keys = ["test1", "test2", "test3", "test2", "test2", "test2"];
211 for k in &keys {
212 hll.insert(k);
213 }
214 assert!((hll.len().round() - 3.0).abs() < std::f64::EPSILON);
215 assert!(!hll.is_empty());
216 hll.clear();
217 assert!(hll.is_empty());
218 assert!(hll.len() == 0.0);
219}
220
221#[test]
222fn hyperloglog_test_merge() {
223 let mut hll = HyperLogLog::new(0.00408);
224 let keys = ["test1", "test2", "test3", "test2", "test2", "test2"];
225 for k in &keys {
226 hll.insert(k);
227 }
228 assert!((hll.len().round() - 3.0).abs() < std::f64::EPSILON);
229
230 let mut hll2 = HyperLogLog::new_from_template(&hll);
231 let keys2 = ["test3", "test4", "test4", "test4", "test4", "test1"];
232 for k in &keys2 {
233 hll2.insert(k);
234 }
235 assert!((hll2.len().round() - 3.0).abs() < std::f64::EPSILON);
236
237 hll.merge(&hll2);
238 assert!((hll.len().round() - 4.0).abs() < std::f64::EPSILON);
239}
240
241#[cfg(feature = "with_serde")]
242#[test]
243fn hyperloglog_test_serialize() {
244 let mut hll = HyperLogLog::new(0.00408);
245 let keys = ["test1", "test2", "test3", "test2", "test2", "test2"];
246 for k in &keys {
247 hll.insert(k);
248 }
249
250 let hll_str: String = serde_json::to_string(&hll).unwrap();
251
252 let hll_de = match serde_json::from_str::<HyperLogLog<String>>(&hll_str) {
253 Ok(hll) => hll,
254 Err(_) => panic!("Failed to deserialize"),
255 };
256
257 assert!((hll.len() - hll_de.len()).abs() < std::f64::EPSILON);
258}
259
260#[test]
261fn hyperloglog_test_merge_with_keys() {
262 let key0 = rand::random();
263 let key1 = rand::random();
264
265 let mut hll = HyperLogLog::new_from_keys(0.00408, key0, key1);
266 let keys = ["test1", "test2", "test3", "test2", "test2", "test2"];
267 for k in &keys {
268 hll.insert(k);
269 }
270 assert!((hll.len().round() - 3.0).abs() < std::f64::EPSILON);
271
272 let mut hll2 = HyperLogLog::new_from_keys(0.00408, key0, key1);
273 let keys2 = ["test3", "test4", "test4", "test4", "test4", "test1"];
274 for k in &keys2 {
275 hll2.insert(k);
276 }
277 assert!((hll2.len().round() - 3.0).abs() < std::f64::EPSILON);
278
279 hll.merge(&hll2);
280 assert!((hll.len().round() - 4.0).abs() < std::f64::EPSILON);
281}