use std::fmt;
const HASH_BITS: u32 = 64;
const HLL_P: u32 = 10;
pub const HLL_REGISTERS: usize = 1 << HLL_P;
const HLL_ALPHA: f64 = 0.7213 / (1.0 + 1.079 / (HLL_REGISTERS as f64));
const SMALL_RANGE_FACTOR: f64 = 2.5;
#[derive(Clone, PartialEq, Eq)]
pub struct HllSketch {
registers: Box<[u8]>,
}
impl Default for HllSketch {
fn default() -> Self {
Self::new()
}
}
impl fmt::Debug for HllSketch {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("HllSketch")
.field("estimate", &(self.estimate() as u64))
.finish()
}
}
impl HllSketch {
pub fn new() -> Self {
Self {
registers: vec![0u8; HLL_REGISTERS].into_boxed_slice(),
}
}
pub fn insert_hash(&mut self, hash: u64) {
let idx = (hash >> (HASH_BITS - HLL_P)) as usize;
let rest = hash << HLL_P;
let rank = (rest.leading_zeros().min(HASH_BITS - HLL_P) + 1) as u8;
if rank > self.registers[idx] {
self.registers[idx] = rank;
}
}
pub fn estimate(&self) -> f64 {
let m = HLL_REGISTERS as f64;
let sum: f64 = self
.registers
.iter()
.map(|&r| 2.0_f64.powi(-i32::from(r)))
.sum();
let raw = HLL_ALPHA * m * m / sum;
if raw <= SMALL_RANGE_FACTOR * m {
let zeros = self.registers.iter().filter(|&&r| r == 0).count();
if zeros > 0 {
return m * (m / zeros as f64).ln();
}
}
raw
}
pub fn merge(&mut self, other: &Self) {
for (a, b) in self.registers.iter_mut().zip(other.registers.iter()) {
if *b > *a {
*a = *b;
}
}
}
pub fn as_bytes(&self) -> &[u8] {
&self.registers
}
pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
if bytes.len() != HLL_REGISTERS {
return None;
}
Some(Self {
registers: bytes.to_vec().into_boxed_slice(),
})
}
}
#[cfg(test)]
mod tests {
use xxhash_rust::xxh3::xxh3_64;
use super::*;
const TEST_DISTINCT: usize = 50_000;
const TEST_MAX_REL_ERR: f64 = 0.20;
#[test]
fn estimate_tracks_distinct_count() {
let mut sketch = HllSketch::new();
for i in 0..TEST_DISTINCT as u64 {
for _ in 0..2 {
sketch.insert_hash(xxh3_64(&i.to_le_bytes()));
}
}
let est = sketch.estimate();
let rel = (est - TEST_DISTINCT as f64).abs() / TEST_DISTINCT as f64;
assert!(
rel < TEST_MAX_REL_ERR,
"estimate {est} vs {TEST_DISTINCT} (rel err {rel:.3})"
);
}
#[test]
fn merge_is_union() {
let mut a = HllSketch::new();
let mut b = HllSketch::new();
let mut both = HllSketch::new();
for i in 0..10_000u64 {
let h = xxh3_64(&i.to_le_bytes());
if i % 2 == 0 {
a.insert_hash(h);
} else {
b.insert_hash(h);
}
both.insert_hash(h);
}
a.merge(&b);
assert_eq!(a, both, "merged sketch must equal the union sketch");
}
#[test]
fn bytes_round_trip() {
let mut sketch = HllSketch::new();
for i in 0..1000u64 {
sketch.insert_hash(xxh3_64(&i.to_le_bytes()));
}
let restored = HllSketch::from_bytes(sketch.as_bytes()).expect("valid length");
assert_eq!(restored, sketch);
assert!(HllSketch::from_bytes(&[0u8; 3]).is_none());
}
#[test]
fn default_debug_and_small_range_estimate() {
let sketch = HllSketch::default();
assert_eq!(
sketch,
HllSketch::new(),
"Default must equal an empty sketch"
);
assert_eq!(
sketch.estimate() as u64,
0,
"an all-zero sketch falls into the small-range branch and estimates 0"
);
let dbg = format!("{sketch:?}");
assert!(dbg.contains("HllSketch"), "Debug must name the type: {dbg}");
assert!(
dbg.contains("estimate"),
"Debug must show the estimate: {dbg}"
);
}
}