use crate::SipHasherBuilder;
#[cfg(feature = "serde")]
use serde_crate::{Deserialize, Serialize};
use std::borrow::Borrow;
use std::cmp;
use std::f64;
use std::fmt::Debug;
use std::hash::BuildHasher;
use std::hash::{Hash, Hasher};
use std::marker::PhantomData;
#[cfg_attr(
feature = "serde",
derive(Deserialize, Serialize),
serde(crate = "serde_crate")
)]
pub struct HyperLogLog<T, B = SipHasherBuilder> {
alpha: f64,
p: usize,
registers: Vec<u8>,
hash_builder: B,
_marker: PhantomData<T>,
}
impl<T> HyperLogLog<T> {
pub fn new(error_probability: f64) -> Self {
Self::with_hasher(error_probability, SipHasherBuilder::from_entropy())
}
}
impl<T, B> HyperLogLog<T, B>
where
B: BuildHasher,
{
fn get_alpha(p: usize) -> f64 {
assert!(4 <= p && p <= 16);
match p {
4 => 0.673,
5 => 0.697,
6 => 0.709,
p => 0.7213 / (1.0 + 1.079 / f64::from(1 << p)),
}
}
pub fn with_hasher(error_probability: f64, hash_builder: B) -> Self {
assert!(0.0 < error_probability && error_probability < 1.0);
let p = (1.04 / error_probability).powi(2).ln().ceil() as usize;
let alpha = Self::get_alpha(p);
let registers_len = 1 << p;
HyperLogLog {
alpha,
p,
registers: vec![0; registers_len],
hash_builder,
_marker: PhantomData,
}
}
pub fn insert<U>(&mut self, item: &U)
where
T: Borrow<U>,
U: Hash + ?Sized,
{
let mut hasher = self.hash_builder.build_hasher();
item.hash(&mut hasher);
let hash = hasher.finish();
let register_index = hash as usize & (self.registers.len() - 1);
let value = (!hash >> self.p).trailing_zeros() as u8;
self.registers[register_index] = cmp::max(self.registers[register_index], value + 1);
}
pub fn merge(&mut self, other: &HyperLogLog<T, B>)
where
B: Debug + PartialEq,
{
assert_eq!(self.p, other.p);
assert_eq!(self.hash_builder, other.hash_builder);
for (index, value) in self.registers.iter_mut().enumerate() {
*value = cmp::max(*value, other.registers[index]);
}
}
fn get_estimate(&self) -> f64 {
let len = self.registers.len() as f64;
1.0 / (self.alpha
* len
* len
* self
.registers
.iter()
.map(|value| 1.0 / 2.0f64.powi(i32::from(*value)))
.sum::<f64>())
}
pub fn len(&self) -> f64 {
let len = self.registers.len() as f64;
match self.get_estimate() {
x if x <= 2.5 * len => {
let zeros = self
.registers
.iter()
.map(|value| if *value == 0 { 1 } else { 0 })
.sum::<u64>();
len * (len / zeros as f64).ln()
}
x if x <= 1.0 / 3.0 * 2.0f64.powi(32) => x,
x => -(2.0f64.powi(32)) * (1.0 - x / 2.0f64.powi(32)).ln(),
}
}
pub fn is_empty(&self) -> bool {
self.len() < f64::EPSILON
}
pub fn clear(&mut self) {
for value in &mut self.registers {
*value = 0;
}
}
pub fn hasher(&self) -> &B {
&self.hash_builder
}
}
#[cfg(test)]
mod tests {
use super::HyperLogLog;
use crate::util::tests::hash_builder_1;
use std::f64::EPSILON;
#[test]
#[should_panic]
fn test_panic_new_invalid_error_probability() {
let _hhl = HyperLogLog::<u32>::new(0.0);
}
#[test]
#[should_panic]
fn test_panic_new_mismatch_error_iprobability() {
let mut hhl1 = HyperLogLog::<u32>::new(0.1);
let hhl2 = HyperLogLog::<u32>::new(0.2);
hhl1.merge(&hhl2);
}
#[test]
fn test_simple() {
let mut hhl = HyperLogLog::<u32>::with_hasher(0.01, hash_builder_1());
assert!(hhl.is_empty());
assert!(hhl.len() < EPSILON);
for key in &[0, 1, 2, 0, 1, 2] {
hhl.insert(&key);
}
assert!(!hhl.is_empty());
assert!((hhl.len().round() - 3.0).abs() < EPSILON);
hhl.clear();
assert!(hhl.is_empty());
}
#[test]
fn test_merge() {
let mut hhl1 = HyperLogLog::<u32>::with_hasher(0.01, hash_builder_1());
for key in &[0, 1, 2, 0, 1, 2] {
hhl1.insert(&key);
}
let mut hhl2 = HyperLogLog::<u32>::with_hasher(0.01, *hhl1.hasher());
for key in &[0, 1, 3, 0, 1, 3] {
hhl2.insert(&key);
}
assert!((hhl1.len().round() - 3.0).abs() < EPSILON);
assert!((hhl2.len().round() - 3.0).abs() < EPSILON);
hhl1.merge(&hhl2);
assert!((hhl1.len().round() - 4.0).abs() < EPSILON);
}
#[cfg(feature = "serde")]
#[test]
fn test_ser_de() {
let mut hhl = HyperLogLog::<u32>::new(0.01);
for key in &[0, 1, 2, 0, 1, 2] {
hhl.insert(&key);
}
let serialized_hhl = bincode::serialize(&hhl).unwrap();
let de_hhl: HyperLogLog<u32> = bincode::deserialize(&serialized_hhl).unwrap();
assert!((hhl.len() - de_hhl.len()).abs() < EPSILON);
assert!((hhl.alpha - de_hhl.alpha).abs() < EPSILON);
assert_eq!(hhl.p, de_hhl.p);
assert_eq!(hhl.registers, de_hhl.registers);
assert_eq!(hhl.hasher(), de_hhl.hasher());
}
}