probably/frequency/
hll.rs

1// (C)opyleft 2013-2019 Frank Denis
2// Virtually all of this code comes from https://github.com/jedisct1/rust-hyperloglog
3
4//! HyperLogLog implementation for Rust
5use crate::frequency::hll_data;
6use siphasher::sip::SipHasher13;
7use std::cmp::Ordering::{Equal, Greater, Less};
8use std::hash::{Hash, Hasher};
9use std::iter::repeat;
10
11#[cfg_attr(feature = "with_serde", derive(serde::Serialize, serde::Deserialize))]
12#[derive(Clone, Debug)]
13pub struct HyperLogLog {
14    alpha: f64,
15    p: u8,
16    m: usize,
17    M: Vec<u8>,
18    key0: u64,
19    key1: u64,
20}
21
22impl Default for HyperLogLog {
23    fn default() -> Self {
24        Self::new(0.05)
25    }
26}
27
28impl HyperLogLog {
29    pub fn new(error_rate: f64) -> Self {
30        let key0 = rand::random();
31        let key1 = rand::random();
32        Self::new_from_keys(error_rate, key0, key1)
33    }
34
35    pub fn new_from_keys(error_rate: f64, key0: u64, key1: u64) -> Self {
36        assert!(error_rate > 0.0 && error_rate < 1.0);
37        let sr = 1.04 / error_rate;
38        let p = f64::ln(sr * sr).ceil() as u8;
39        assert!(p <= 64);
40        let alpha = Self::get_alpha(p);
41        let m = 1usize << p;
42
43        HyperLogLog {
44            alpha,
45            p,
46            m,
47            M: repeat(0u8).take(m).collect(),
48            key0,
49            key1,
50        }
51    }
52
53    fn hash<V2>(&self, value: &V2) -> u64
54    where
55        V2: Hash,
56    {
57        let mut sip = SipHasher13::new_with_keys(self.key0, self.key1);
58        value.hash(&mut sip);
59        sip.finish()
60    }
61
62    pub fn new_from_template(hll: &HyperLogLog) -> Self {
63        HyperLogLog {
64            alpha: hll.alpha,
65            p: hll.p,
66            m: hll.m,
67            key0: hll.key0,
68            key1: hll.key1,
69            M: repeat(0u8).take(hll.m).collect(),
70        }
71    }
72
73    pub fn insert<V>(&mut self, value: &V)
74    where
75        V: Hash,
76    {
77        let x = self.hash(value);
78        let j = x as usize & (self.m - 1);
79        let w = x >> self.p;
80        let rho = Self::get_rho(w, 64 - self.p);
81        let mjr = &mut self.M[j];
82        if rho > *mjr {
83            *mjr = rho;
84        }
85    }
86
87    pub fn len(&self) -> f64 {
88        let V = Self::vec_count_zero(&self.M);
89        if V > 0 {
90            let H = self.m as f64 * (self.m as f64 / V as f64).ln();
91            if H <= Self::get_threshold(self.p) {
92                H
93            } else {
94                self.ep()
95            }
96        } else {
97            self.ep()
98        }
99    }
100
101    pub fn is_empty(&self) -> bool {
102        self.len() == 0.0
103    }
104
105    pub fn merge(&mut self, src: &HyperLogLog) {
106        assert!(src.p == self.p);
107        assert!(src.m == self.m);
108
109        for i in 0..self.m {
110            let (src_mir, mir) = (src.M[i], &mut self.M[i]);
111            if src_mir > *mir {
112                *mir = src_mir;
113            }
114        }
115    }
116
117    pub fn clear(&mut self) {
118        self.M.iter_mut().all(|x| {
119            *x = 0;
120            true
121        });
122    }
123
124    fn get_threshold(p: u8) -> f64 {
125        hll_data::THRESHOLD_DATA[p as usize]
126    }
127
128    fn get_alpha(p: u8) -> f64 {
129        assert!(p >= 4 && p <= 16);
130        match p {
131            4 => 0.673,
132            5 => 0.697,
133            6 => 0.709,
134            _ => 0.7213 / (1.0 + 1.079 / (1usize << (p as usize)) as f64),
135        }
136    }
137
138    fn bit_length(x: u64) -> u8 {
139        let mut bits: u8 = 0;
140        let mut xm = x;
141        while xm != 0 {
142            bits += 1;
143            xm >>= 1;
144        }
145        bits
146    }
147
148    fn get_rho(w: u64, max_width: u8) -> u8 {
149        let rho = max_width - Self::bit_length(w) + 1;
150        assert!(rho > 0);
151        rho
152    }
153
154    fn vec_count_zero(v: &[u8]) -> usize {
155        bytecount::count(v, 0)
156    }
157
158    fn estimate_bias(E: f64, p: u8) -> f64 {
159        let bias_vector = hll_data::BIAS_DATA[(p - 4) as usize];
160        let nearest_neighbors =
161            Self::get_nearest_neighbors(E, hll_data::RAW_ESTIMATE_DATA[(p - 4) as usize]);
162        let sum = nearest_neighbors
163            .iter()
164            .fold(0.0, |acc, &neighbor| acc + bias_vector[neighbor]);
165        sum / nearest_neighbors.len() as f64
166    }
167
168    fn get_nearest_neighbors(E: f64, estimate_vector: &[f64]) -> Vec<usize> {
169        let ev_len = estimate_vector.len();
170        let mut r: Vec<(f64, usize)> = repeat((0.0f64, 0usize)).take(ev_len).collect();
171        for i in 0..ev_len {
172            let dr = E - estimate_vector[i];
173            r[i] = (dr * dr, i);
174        }
175        r.sort_by(|a, b| {
176            if a < b {
177                Less
178            } else if a > b {
179                Greater
180            } else {
181                Equal
182            }
183        });
184        r.truncate(6);
185        r.iter()
186            .map(|&ez| {
187                let (_, b) = ez;
188                b
189            })
190            .collect()
191    }
192
193    fn ep(&self) -> f64 {
194        let sum = self
195            .M
196            .iter()
197            .fold(0.0, |acc, &x| acc + 2.0f64.powi(-(x as i32)));
198        let E = self.alpha * (self.m * self.m) as f64 / sum;
199        if E <= (5 * self.m) as f64 {
200            E - Self::estimate_bias(E, self.p)
201        } else {
202            E
203        }
204    }
205}
206
207#[test]
208fn hyperloglog_test_simple() {
209    let mut hll = HyperLogLog::new(0.00408);
210    let keys = ["test1", "test2", "test3", "test2", "test2", "test2"];
211    for k in &keys {
212        hll.insert(k);
213    }
214    assert!((hll.len().round() - 3.0).abs() < std::f64::EPSILON);
215    assert!(!hll.is_empty());
216    hll.clear();
217    assert!(hll.is_empty());
218    assert!(hll.len() == 0.0);
219}
220
221#[test]
222fn hyperloglog_test_merge() {
223    let mut hll = HyperLogLog::new(0.00408);
224    let keys = ["test1", "test2", "test3", "test2", "test2", "test2"];
225    for k in &keys {
226        hll.insert(k);
227    }
228    assert!((hll.len().round() - 3.0).abs() < std::f64::EPSILON);
229
230    let mut hll2 = HyperLogLog::new_from_template(&hll);
231    let keys2 = ["test3", "test4", "test4", "test4", "test4", "test1"];
232    for k in &keys2 {
233        hll2.insert(k);
234    }
235    assert!((hll2.len().round() - 3.0).abs() < std::f64::EPSILON);
236
237    hll.merge(&hll2);
238    assert!((hll.len().round() - 4.0).abs() < std::f64::EPSILON);
239}
240
241#[cfg(feature = "with_serde")]
242#[test]
243fn hyperloglog_test_serialize() {
244    let mut hll = HyperLogLog::new(0.00408);
245    let keys = ["test1", "test2", "test3", "test2", "test2", "test2"];
246    for k in &keys {
247        hll.insert(k);
248    }
249
250    let hll_str: String = serde_json::to_string(&hll).unwrap();
251
252    let hll_de = match serde_json::from_str::<HyperLogLog<String>>(&hll_str) {
253        Ok(hll) => hll,
254        Err(_) => panic!("Failed to deserialize"),
255    };
256
257    assert!((hll.len() - hll_de.len()).abs() < std::f64::EPSILON);
258}
259
260#[test]
261fn hyperloglog_test_merge_with_keys() {
262    let key0 = rand::random();
263    let key1 = rand::random();
264
265    let mut hll = HyperLogLog::new_from_keys(0.00408, key0, key1);
266    let keys = ["test1", "test2", "test3", "test2", "test2", "test2"];
267    for k in &keys {
268        hll.insert(k);
269    }
270    assert!((hll.len().round() - 3.0).abs() < std::f64::EPSILON);
271
272    let mut hll2 = HyperLogLog::new_from_keys(0.00408, key0, key1);
273    let keys2 = ["test3", "test4", "test4", "test4", "test4", "test1"];
274    for k in &keys2 {
275        hll2.insert(k);
276    }
277    assert!((hll2.len().round() - 3.0).abs() < std::f64::EPSILON);
278
279    hll.merge(&hll2);
280    assert!((hll.len().round() - 4.0).abs() < std::f64::EPSILON);
281}