use std::borrow::Borrow;
use std::collections::HashSet;
use std::hash::{BuildHasher, Hash, Hasher};
use std::marker::PhantomData;
use serde::{Deserialize, Serialize};
use crate::common::*;
use crate::constants;
use crate::encoding::DifIntVec;
use crate::HyperLogLog;
use crate::HyperLogLogError;
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct HyperLogLogPlus<H, B>
where
H: Hash + ?Sized,
B: BuildHasher,
{
builder: B,
precision: u8,
counts: (usize, usize, usize),
tmpset: HashSet<u32>,
sparse: DifIntVec,
registers: Option<RegistersPlus>,
phantom: PhantomData<H>,
}
impl<H, B> HyperLogLogPlus<H, B>
where
H: Hash + ?Sized,
B: BuildHasher,
{
const MIN_PRECISION: u8 = 4;
const MAX_PRECISION: u8 = 18;
const PRIME_PRECISION: u8 = 25;
pub fn new(precision: u8, builder: B) -> Result<Self, HyperLogLogError> {
if precision < Self::MIN_PRECISION || precision > Self::MAX_PRECISION {
return Err(HyperLogLogError::InvalidPrecision);
}
let count = Self::register_count(precision);
let counts = (
count,
Self::register_count(Self::PRIME_PRECISION - 1),
RegistersPlus::size_in_bytes(count),
);
Ok(HyperLogLogPlus {
builder: builder,
precision: precision,
counts: counts,
tmpset: HashSet::new(),
sparse: DifIntVec::new(),
registers: None,
phantom: PhantomData,
})
}
pub fn merge<S, T>(
&mut self,
other: &HyperLogLogPlus<S, T>,
) -> Result<(), HyperLogLogError>
where
S: Hash + ?Sized,
T: BuildHasher,
{
if self.precision != other.precision() {
return Err(HyperLogLogError::IncompatiblePrecision);
}
if other.is_sparse() {
if self.is_sparse() {
for hash_code in other.tmpset.iter() {
self.tmpset.insert(*hash_code);
}
for hash_code in other.sparse.into_iter() {
self.tmpset.insert(hash_code);
}
if self.tmpset.len() * 100 > self.counts.2 {
self.merge_sparse()
}
} else {
let registers = self.registers.as_mut().unwrap();
for hash_code in other.tmpset.iter() {
let (zeros, index) = other.decode_hash(*hash_code);
registers.set_greater(index, zeros);
}
for hash_code in other.sparse.into_iter() {
let (zeros, index) = other.decode_hash(hash_code);
registers.set_greater(index, zeros);
}
}
} else {
if self.is_sparse() {
self.merge_sparse();
if self.is_sparse() {
self.sparse_to_normal();
}
}
let registers = self.registers.as_mut().unwrap();
let other_registers_iter = other.registers_iter().unwrap();
for (i, val) in other_registers_iter.enumerate() {
registers.set_greater(i, val);
}
}
Ok(())
}
pub fn insert_any<R>(&mut self, value: &R)
where
R: Hash + ?Sized,
{
self.insert_impl(value);
}
#[inline(always)]
fn insert_impl<R>(&mut self, value: &R)
where
R: Hash + ?Sized,
{
let mut hasher = self.builder.build_hasher();
value.hash(&mut hasher);
let mut hash: u64 = hasher.finish();
match &mut self.registers {
Some(registers) => {
let index: usize = (hash >> (64 - self.precision)) as usize;
hash = (hash << self.precision) | (1 << (self.precision - 1));
let zeros: u32 = 1 + hash.leading_zeros();
registers.set_greater(index, zeros);
},
None => {
let hash_code = self.encode_hash(hash);
self.tmpset.insert(hash_code);
if self.tmpset.len() * 100 > self.counts.2 {
self.merge_sparse()
}
},
}
}
#[inline] fn precision(&self) -> u8 {
self.precision
}
#[inline] fn registers_iter(&self) -> Option<impl Iterator<Item = u32> + '_> {
self.registers
.as_ref()
.and_then(|registers| Some(registers.iter()))
}
#[inline] fn is_sparse(&self) -> bool {
self.registers.is_none()
}
#[inline] fn encode_hash(&self, mut hash: u64) -> u32 {
let index: u64 = u64::extract(hash, 64, 64 - Self::PRIME_PRECISION);
let dif: u64 =
u64::extract(hash, 64 - self.precision, 64 - Self::PRIME_PRECISION);
if dif == 0 {
hash = (hash << Self::PRIME_PRECISION) |
(1 << Self::PRIME_PRECISION - 1);
let zeros: u32 = 1 + hash.leading_zeros();
return ((index as u32) << 7) | (zeros << 1) | 1;
}
(index << 1) as u32
}
#[inline] fn index(&self, hash_code: u32) -> usize {
if hash_code & 1 == 1 {
return u32::extract(hash_code, 32, 32 - self.precision) as usize;
}
u32::extract(
hash_code,
Self::PRIME_PRECISION + 1,
Self::PRIME_PRECISION - self.precision + 1,
) as usize
}
#[inline] fn decode_hash(&self, hash_code: u32) -> (u32, usize) {
if hash_code & 1 == 1 {
return (
u32::extract(hash_code, 7, 1) +
(Self::PRIME_PRECISION - self.precision) as u32,
self.index(hash_code),
);
}
let hash =
hash_code << (32 - Self::PRIME_PRECISION + self.precision - 1);
(hash.leading_zeros() + 1, self.index(hash_code))
}
fn sparse_to_normal(&mut self) {
let mut registers: RegistersPlus =
RegistersPlus::with_count(self.counts.0);
for hash_code in self.sparse.into_iter() {
let (zeros, index) = self.decode_hash(hash_code);
registers.set_greater(index, zeros);
}
self.registers = Some(registers);
self.tmpset.clear();
self.sparse.clear();
}
fn merge_sparse(&mut self) {
let mut set_codes: Vec<u32> = self.tmpset.iter().copied().collect();
set_codes.sort();
let mut buf = DifIntVec::with_capacity(self.sparse.len());
let (mut set_iter, mut buf_iter) =
(set_codes.iter(), self.sparse.into_iter());
let (mut set_hash_option, mut buf_hash_option) =
(set_iter.next(), buf_iter.next());
while set_hash_option.is_some() || buf_hash_option.is_some() {
if set_hash_option.is_none() {
buf.push(buf_hash_option.unwrap());
buf_hash_option = buf_iter.next();
continue;
}
if buf_hash_option.is_none() {
buf.push(*set_hash_option.unwrap());
set_hash_option = set_iter.next();
continue;
}
let (set_hash_code, buf_hash_code) =
(*set_hash_option.unwrap(), buf_hash_option.unwrap());
if set_hash_code == buf_hash_code {
buf.push(set_hash_code);
set_hash_option = set_iter.next();
buf_hash_option = buf_iter.next();
} else if set_hash_code > buf_hash_code {
buf.push(buf_hash_code);
buf_hash_option = buf_iter.next();
} else {
buf.push(set_hash_code);
set_hash_option = set_iter.next();
}
}
self.sparse = buf;
self.tmpset.clear();
if self.sparse.len() > self.counts.2 {
self.sparse_to_normal();
}
}
fn estimate_bias(&self, raw: f64) -> f64 {
let biases = &constants::BIAS_DATA
[(self.precision - Self::MIN_PRECISION) as usize];
let estimates = &constants::RAW_ESTIMATE_DATA
[(self.precision - Self::MIN_PRECISION) as usize];
if raw <= estimates[0] {
return biases[0];
} else if estimates[estimates.len() - 1] <= raw {
return biases[biases.len() - 1];
}
let res =
estimates.binary_search_by(|est| est.partial_cmp(&raw).unwrap());
let (prv, idx) = match res {
Ok(idx) => (idx - 1, idx),
Err(idx) => (idx - 1, idx),
};
let ratio = (raw - estimates[prv]) / (estimates[idx] - estimates[prv]);
biases[prv] + ratio * (biases[idx] - biases[prv])
}
#[inline] fn threshold(precision: u8) -> f64 {
match precision {
4 => 10.0,
5 => 10.0,
6 => 40.0,
7 => 80.0,
8 => 220.0,
9 => 400.0,
10 => 900.0,
11 => 1800.0,
12 => 3100.0,
13 => 6500.0,
14 => 11500.0,
15 => 20000.0,
16 => 50000.0,
17 => 120000.0,
18 => 350000.0,
_ => unreachable!(),
}
}
}
impl<H, B> HyperLogLogCommon for HyperLogLogPlus<H, B>
where
H: Hash + ?Sized,
B: BuildHasher,
{
}
impl<H, B> HyperLogLog<H> for HyperLogLogPlus<H, B>
where
H: Hash + ?Sized,
B: BuildHasher,
{
fn add(&mut self, value: &H) {
self.insert_impl(value);
}
fn insert<Q>(&mut self, value: &Q)
where
H: Borrow<Q>,
Q: Hash + ?Sized,
{
self.insert_impl(value);
}
fn count(&mut self) -> f64 {
if self.registers.is_none() {
self.merge_sparse();
}
match self.registers.as_mut() {
Some(registers) => {
let zeros = registers.zeros();
if zeros != 0 {
let correction = Self::linear_count(self.counts.0, zeros);
if correction <= Self::threshold(self.precision) {
correction
} else {
let mut raw = Self::estimate_raw_plus(
registers.iter(),
self.counts.0,
);
if raw <= 5.0 * self.counts.0 as f64 {
raw -= self.estimate_bias(raw);
}
raw
}
} else {
let mut raw = Self::estimate_raw_plus(
registers.iter(),
self.counts.0,
);
if raw <= 5.0 * self.counts.0 as f64 {
raw -= self.estimate_bias(raw);
}
raw
}
},
None => {
let zeros = self.counts.1 - self.sparse.count();
Self::linear_count(self.counts.1, zeros)
},
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::hash_map::DefaultHasher;
use std::hash::{BuildHasher, Hasher};
struct PassThroughHasher(u64);
impl Hasher for PassThroughHasher {
#[inline]
fn finish(&self) -> u64 {
self.0
}
#[inline]
fn write(&mut self, _: &[u8]) {}
#[inline]
fn write_u64(&mut self, i: u64) {
self.0 = i;
}
}
#[derive(Serialize, Deserialize)]
struct PassThroughHasherBuilder;
impl BuildHasher for PassThroughHasherBuilder {
type Hasher = PassThroughHasher;
fn build_hasher(&self) -> Self::Hasher {
PassThroughHasher(0)
}
}
#[derive(Serialize, Deserialize)]
struct DefaultBuildHasher;
impl BuildHasher for DefaultBuildHasher {
type Hasher = DefaultHasher;
fn build_hasher(&self) -> Self::Hasher {
DefaultHasher::new()
}
}
#[test]
fn test_normal_insert() {
let builder = PassThroughHasherBuilder {};
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, builder).unwrap();
hll.sparse_to_normal();
assert!(hll.registers.is_some());
hll.insert(&0x00010fffffffffff);
assert_eq!(hll.registers.as_ref().unwrap().get(1), 5);
hll.insert(&0x0002ffffffffffff);
assert_eq!(hll.registers.as_ref().unwrap().get(2), 1);
hll.insert(&0x0003000000000000);
assert_eq!(hll.registers.as_ref().unwrap().get(3), 49);
hll.insert(&0x0003000000000001);
assert_eq!(hll.registers.as_ref().unwrap().get(3), 49);
hll.insert(&0xff03700000000000);
assert_eq!(hll.registers.as_ref().unwrap().get(0xff03), 2);
hll.insert(&0xff03080000000000);
assert_eq!(hll.registers.as_ref().unwrap().get(0xff03), 5);
let builder = PassThroughHasherBuilder {};
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(4, builder).unwrap();
hll.sparse_to_normal();
hll.insert(&0x1fffffffffffffff);
assert_eq!(hll.registers.as_ref().unwrap().get(1), 1);
hll.insert(&0xffffffffffffffff);
assert_eq!(hll.registers.as_ref().unwrap().get(0xf), 1);
hll.insert(&0x00ffffffffffffff);
assert_eq!(hll.registers.as_ref().unwrap().get(0), 5);
}
#[test]
fn test_sparse_encode_hash() {
let builder = PassThroughHasherBuilder {};
let hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(14, builder).unwrap();
let index: u64 = 0b0000000000111000000000000;
let hash: u64 = 0b1101;
let hash_code = hll.encode_hash((index << 64 - 25) | hash);
assert_eq!(hash_code, (index << 7) as u32 | (35 + 1 << 1) | 1);
let index: u64 = 0b0000000000111000000000010;
let hash: u64 = 0b1101;
let hash_code = hll.encode_hash((index << 64 - 25) | hash);
assert_eq!(hash_code, (index << 1) as u32);
}
#[test]
fn test_sparse_decode_hash() {
let builder = PassThroughHasherBuilder {};
let hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(8, builder).unwrap();
let (zeros, index) =
hll.decode_hash(hll.encode_hash(0xffffff8000000000));
assert_eq!((zeros, index), (1, 0xff));
let (zeros, index) =
hll.decode_hash(hll.encode_hash(0xff00000000000000));
assert_eq!((zeros, index), (57, 0xff));
let (zeros, index) =
hll.decode_hash(hll.encode_hash(0xff30000000000000));
assert_eq!((zeros, index), (3, 0xff));
let (zeros, index) =
hll.decode_hash(hll.encode_hash(0xaa10000000000000));
assert_eq!((zeros, index), (4, 0xaa));
let (zeros, index) =
hll.decode_hash(hll.encode_hash(0xaa0f000000000000));
assert_eq!((zeros, index), (5, 0xaa));
}
#[test]
fn test_sparse_merge_sparse() {
let builder = PassThroughHasherBuilder {};
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, builder).unwrap();
let hashes: [u64; 3] =
[0xf000017000000000, 0x000fff8f00000000, 0x0f00017000000000];
let hash_codes: [u32; 3] = [
hll.encode_hash(hashes[0]),
hll.encode_hash(hashes[1]),
hll.encode_hash(hashes[2]),
];
hll.insert(&hashes[0]);
assert!(hll.tmpset.contains(&hash_codes[0]));
hll.insert(&hashes[1]);
assert!(hll.tmpset.contains(&hash_codes[1]));
assert_eq!(hll.tmpset.len(), 2);
assert_eq!(hll.sparse.len(), 0);
hll.merge_sparse();
assert_eq!(hll.sparse.count(), 2);
assert_eq!(hll.tmpset.len(), 0);
let hll_hash_codes: Vec<u32> = hll.sparse.into_iter().collect();
assert_eq!(hll_hash_codes, vec![hash_codes[1], hash_codes[0]]);
hll.insert(&hashes[2]);
assert!(hll.tmpset.contains(&hash_codes[2]));
hll.merge_sparse();
assert_eq!(hll.sparse.count(), 3);
assert_eq!(hll.tmpset.len(), 0);
let hll_hash_codes: Vec<u32> = hll.sparse.into_iter().collect();
assert_eq!(
hll_hash_codes,
vec![hash_codes[1], hash_codes[2], hash_codes[0]]
);
}
#[test]
fn test_sparse_merge_to_normal() {
let builder = PassThroughHasherBuilder {};
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(7, builder).unwrap();
for i in 0u64..102 {
hll.insert(&(i << 39));
hll.count();
}
hll.insert(&1);
assert!(hll.registers.is_none());
hll.count();
assert!(hll.registers.is_some());
assert_eq!(hll.tmpset.len(), 0);
assert_eq!(hll.sparse.len(), 0);
}
#[test]
fn test_sparse_trigger_sparse_to_normal() {
let builder = PassThroughHasherBuilder {};
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(4, builder).unwrap();
for i in 0u64..12 {
hll.insert(&(1 << i));
}
assert!(hll.registers.is_none());
hll.insert(&(1 << 13));
assert!(hll.registers.is_some());
assert_eq!(hll.tmpset.len(), 0);
assert_eq!(hll.sparse.len(), 0);
}
#[test]
fn test_sparse_sparse_to_normal() {
let builder = PassThroughHasherBuilder {};
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, builder).unwrap();
hll.insert(&0x00010fffffffffff);
assert_eq!(hll.count() as u64, 1);
hll.merge_sparse();
hll.sparse_to_normal();
assert_eq!(hll.count() as u64, 1);
assert!(hll.registers.is_some());
let builder = PassThroughHasherBuilder {};
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, builder).unwrap();
hll.insert(&0x00010fffffffffff);
hll.insert(&0x0002ffffffffffff);
hll.insert(&0x0003000000000000);
hll.insert(&0x0003000000000001);
hll.insert(&0xff03700000000000);
hll.insert(&0xff03080000000000);
hll.merge_sparse();
hll.sparse_to_normal();
assert_eq!(hll.count() as u64, 4);
assert_eq!(hll.registers.as_ref().unwrap().get(1), 5);
assert_eq!(hll.registers.as_ref().unwrap().get(2), 1);
assert_eq!(hll.registers.as_ref().unwrap().get(3), 49);
assert_eq!(hll.registers.as_ref().unwrap().get(0xff03), 5);
}
#[test]
fn test_sparse_count() {
let builder = PassThroughHasherBuilder {};
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, builder).unwrap();
let hashes: [u64; 6] = [
0x00010fffffffffff,
0x00020fffffffffff,
0x00030fffffffffff,
0x00040fffffffffff,
0x00050fffffffffff,
0x00050fffffffffff,
];
for hash in &hashes {
hll.insert(hash);
}
hll.count();
let hash_codes: Vec<u32> = hll.sparse.into_iter().collect();
let expected_hash_codes: Vec<u32> =
hashes.iter().map(|hash| hll.encode_hash(*hash)).collect();
assert_eq!(hash_codes.as_slice(), &expected_hash_codes[..5]);
assert_eq!(hll.count() as u64, 5);
}
#[test]
fn test_estimate_bias() {
let builder = PassThroughHasherBuilder {};
let hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(4, builder).unwrap();
let bias = hll.estimate_bias(14.0988);
assert!((bias - 7.5988).abs() <= 1e-5);
let bias = hll.estimate_bias(10.0);
assert!((bias - 10.0).abs() < 1e-5);
let bias = hll.estimate_bias(80.0);
assert!((bias - (-1.7606)).abs() < 1e-5);
let builder = PassThroughHasherBuilder {};
let hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, builder).unwrap();
let bias = hll.estimate_bias(55391.4373);
assert!((bias - 39416.9373).abs() < 1e-5);
let builder = PassThroughHasherBuilder {};
let hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(18, builder).unwrap();
let bias = hll.estimate_bias(275468.768);
assert!((bias - 118181.769).abs() <= 1e-3);
let bias = hll.estimate_bias(587532.522);
assert!((bias - 23922.523).abs() <= 1e-3);
let bias = hll.estimate_bias(1205430.993);
assert!((bias - (-434.006000000052)).abs() <= 1e-3);
let bias = hll.estimate_bias(1251260.649);
assert!((bias - (-479.351000000024)).abs() <= 1e-3);
}
#[test]
fn test_estimate_bias_count() {
let builder = PassThroughHasherBuilder {};
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(4, builder).unwrap();
hll.sparse_to_normal();
for i in 0u64..10 {
hll.insert(&((i << 60) + 0xfffffffffffffff));
}
assert!((10.0 - hll.count()).abs() < 1.0);
}
#[test]
fn test_merge_error() {
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, PassThroughHasherBuilder {}).unwrap();
let other: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(12, PassThroughHasherBuilder {}).unwrap();
assert_eq!(
hll.merge(&other),
Err(HyperLogLogError::IncompatiblePrecision)
);
}
#[test]
fn test_merge_both_sparse() {
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, PassThroughHasherBuilder {}).unwrap();
let mut other: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, PassThroughHasherBuilder {}).unwrap();
other.insert(&0x00010fffffffffff);
other.insert(&0x00020fffffffffff);
other.insert(&0x00030fffffffffff);
other.insert(&0x00040fffffffffff);
other.insert(&0x00050fffffffffff);
other.insert(&0x00050fffffffffff);
assert_eq!(other.count().trunc() as u64, 5);
let res = hll.merge(&other);
assert_eq!(res, Ok(()));
assert_eq!(hll.count().trunc() as u64, 5);
assert!(hll.is_sparse() && other.is_sparse());
let res = hll.merge(&other);
assert_eq!(res, Ok(()));
assert_eq!(hll.count().trunc() as u64, 5);
assert!(hll.is_sparse() && other.is_sparse());
other.insert(&0x00060fffffffffff);
other.insert(&0x00070fffffffffff);
other.insert(&0x00080fffffffffff);
other.insert(&0x00090fffffffffff);
other.insert(&0x000a0fffffffffff);
assert_eq!(other.count().trunc() as u64, 10);
let res = hll.merge(&other);
assert_eq!(res, Ok(()));
assert_eq!(hll.count().trunc() as u64, 10);
assert!(hll.is_sparse() && other.is_sparse());
}
#[test]
fn test_merge_both_normal() {
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, PassThroughHasherBuilder {}).unwrap();
let mut other: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, PassThroughHasherBuilder {}).unwrap();
hll.sparse_to_normal();
other.sparse_to_normal();
other.insert(&0x00010fffffffffff);
other.insert(&0x00020fffffffffff);
other.insert(&0x00030fffffffffff);
other.insert(&0x00040fffffffffff);
other.insert(&0x00050fffffffffff);
other.insert(&0x00050fffffffffff);
assert_eq!(other.count().trunc() as u64, 5);
let res = hll.merge(&other);
assert_eq!(res, Ok(()));
assert_eq!(hll.count().trunc() as u64, 5);
assert!(!hll.is_sparse() && !other.is_sparse());
let res = hll.merge(&other);
assert_eq!(res, Ok(()));
assert_eq!(hll.count().trunc() as u64, 5);
assert!(!hll.is_sparse() && !other.is_sparse());
other.insert(&0x00060fffffffffff);
other.insert(&0x00070fffffffffff);
other.insert(&0x00080fffffffffff);
other.insert(&0x00090fffffffffff);
other.insert(&0x000a0fffffffffff);
assert_eq!(other.count().trunc() as u64, 10);
let res = hll.merge(&other);
assert_eq!(res, Ok(()));
assert_eq!(hll.count().trunc() as u64, 10);
assert!(!hll.is_sparse() && !other.is_sparse());
}
#[test]
fn test_merge_sparse_to_normal() {
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, PassThroughHasherBuilder {}).unwrap();
let mut other: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, PassThroughHasherBuilder {}).unwrap();
hll.sparse_to_normal();
other.insert(&0x00010fffffffffff);
other.insert(&0x00020fffffffffff);
other.insert(&0x00030fffffffffff);
other.insert(&0x00040fffffffffff);
other.insert(&0x00050fffffffffff);
other.insert(&0x00050fffffffffff);
assert_eq!(other.count().trunc() as u64, 5);
let res = hll.merge(&other);
assert_eq!(res, Ok(()));
assert_eq!(hll.count().trunc() as u64, 5);
assert!(!hll.is_sparse() && other.is_sparse());
let res = hll.merge(&other);
assert_eq!(res, Ok(()));
assert_eq!(hll.count().trunc() as u64, 5);
assert!(!hll.is_sparse() && other.is_sparse());
other.insert(&0x00060fffffffffff);
other.insert(&0x00070fffffffffff);
other.insert(&0x00080fffffffffff);
other.insert(&0x00090fffffffffff);
other.insert(&0x000a0fffffffffff);
assert_eq!(other.count().trunc() as u64, 10);
let res = hll.merge(&other);
assert_eq!(res, Ok(()));
assert_eq!(hll.count().trunc() as u64, 10);
assert!(!hll.is_sparse() && other.is_sparse());
}
#[test]
fn test_merge_normal_to_sparse() {
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, PassThroughHasherBuilder {}).unwrap();
let mut other: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, PassThroughHasherBuilder {}).unwrap();
other.sparse_to_normal();
other.insert(&0x00010fffffffffff);
other.insert(&0x00020fffffffffff);
other.insert(&0x00030fffffffffff);
other.insert(&0x00040fffffffffff);
other.insert(&0x00050fffffffffff);
other.insert(&0x00050fffffffffff);
assert_eq!(other.count().trunc() as u64, 5);
let res = hll.merge(&other);
assert_eq!(res, Ok(()));
assert_eq!(hll.count().trunc() as u64, 5);
assert!(!hll.is_sparse() && !other.is_sparse());
}
#[test]
fn test_serialization() {
let builder = PassThroughHasherBuilder {};
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, builder).unwrap();
hll.insert(&0x00010fffffffffff);
hll.insert(&0x00020fffffffffff);
hll.insert(&0x00030fffffffffff);
hll.insert(&0x00040fffffffffff);
hll.insert(&0x00050fffffffffff);
hll.insert(&0x00050fffffffffff);
assert_eq!(hll.count().trunc() as usize, 5);
let serialized = serde_json::to_string(&hll).unwrap();
let mut deserialized: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
serde_json::from_str(&serialized).unwrap();
assert_eq!(deserialized.count().trunc() as usize, 5);
deserialized.insert(&0x00060fffffffffff);
assert_eq!(deserialized.count().trunc() as usize, 6);
hll.sparse_to_normal();
assert_eq!(hll.count().trunc() as usize, 5);
let serialized = serde_json::to_string(&hll).unwrap();
let mut deserialized: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
serde_json::from_str(&serialized).unwrap();
assert_eq!(deserialized.count().trunc() as usize, 5);
deserialized.insert(&0x00060fffffffffff);
assert_eq!(deserialized.count().trunc() as usize, 6);
}
#[cfg(feature = "bench-units")]
mod benches {
extern crate test;
use super::*;
use rand::prelude::*;
use test::{black_box, Bencher};
#[bench]
fn bench_plus_insert_normal(b: &mut Bencher) {
let builder = PassThroughHasherBuilder {};
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, builder).unwrap();
hll.sparse_to_normal();
b.iter(|| {
for i in 0u64..1000 {
hll.insert(&(u64::max_value() - i));
}
})
}
#[bench]
fn bench_insert_normal_with_hash(b: &mut Bencher) {
let mut rng = rand::thread_rng();
let workload: Vec<String> = (0..2000)
.map(|_| {
format!("- {} - {} -", rng.gen::<u64>(), rng.gen::<u64>())
})
.collect();
b.iter(|| {
let mut hll: HyperLogLogPlus<&String, DefaultBuildHasher> =
HyperLogLogPlus::new(16, DefaultBuildHasher {}).unwrap();
hll.sparse_to_normal();
for val in &workload {
hll.insert(&val);
}
let val = hll.count();
black_box(val);
})
}
#[bench]
fn bench_plus_count_normal(b: &mut Bencher) {
let builder = PassThroughHasherBuilder {};
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, builder).unwrap();
hll.sparse_to_normal();
b.iter(|| {
let count = hll.count();
black_box(count);
})
}
#[bench]
fn bench_plus_merge_sparse(b: &mut Bencher) {
let builder = PassThroughHasherBuilder {};
let mut hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(16, builder).unwrap();
for i in 0u64..500 {
hll.insert(&(i << 39));
}
assert_eq!(hll.tmpset.len(), 500);
let set = hll.tmpset.clone();
b.iter(|| {
hll.tmpset = set.clone();
hll.merge_sparse()
});
assert!(hll.registers.is_none());
assert_eq!(hll.tmpset.len(), 0);
}
#[bench]
fn bench_estimate_bias(b: &mut Bencher) {
let builder = PassThroughHasherBuilder {};
let hll: HyperLogLogPlus<u64, PassThroughHasherBuilder> =
HyperLogLogPlus::new(18, builder).unwrap();
b.iter(|| {
let bias = hll.estimate_bias(275468.768);
black_box(bias);
let bias = hll.estimate_bias(587532.522);
black_box(bias);
let bias = hll.estimate_bias(1205430.993);
black_box(bias);
let bias = hll.estimate_bias(1251260.649);
black_box(bias);
});
}
}
}