use crate::error::{CoreError, CoreResult, ErrorContext};
use crate::error_context;
use std::hash::{BuildHasher, Hasher};
const MIN_PRECISION: u8 = 4;
const MAX_PRECISION: u8 = 18;
const SPARSE_THRESHOLD_FRACTION: f64 = 0.75;
#[derive(Clone)]
pub struct HyperLogLog {
precision: u8,
num_registers: usize,
repr: HllRepr,
hash_builder: std::collections::hash_map::RandomState,
}
#[derive(Clone)]
enum HllRepr {
Sparse {
entries: Vec<(u32, u8)>,
},
Dense {
registers: Vec<u8>,
},
}
impl HyperLogLog {
pub fn new(precision: u8) -> CoreResult<Self> {
if precision < MIN_PRECISION || precision > MAX_PRECISION {
return Err(CoreError::InvalidArgument(error_context!(format!(
"precision must be in [{MIN_PRECISION}, {MAX_PRECISION}], got {precision}"
))));
}
let num_registers = 1usize << precision;
Ok(Self {
precision,
num_registers,
repr: HllRepr::Sparse {
entries: Vec::new(),
},
hash_builder: std::collections::hash_map::RandomState::new(),
})
}
pub fn insert(&mut self, item: &[u8]) {
let hash = self.hash_item(item);
let (register_idx, rho) = self.decode_hash(hash);
match &mut self.repr {
HllRepr::Sparse { entries } => {
match entries.binary_search_by_key(®ister_idx, |&(idx, _)| idx) {
Ok(pos) => {
if rho > entries[pos].1 {
entries[pos].1 = rho;
}
}
Err(pos) => {
entries.insert(pos, (register_idx, rho));
}
}
let threshold =
(self.num_registers as f64 * SPARSE_THRESHOLD_FRACTION) as usize;
if entries.len() > threshold {
self.switch_to_dense();
}
}
HllRepr::Dense { registers } => {
if rho > registers[register_idx as usize] {
registers[register_idx as usize] = rho;
}
}
}
}
pub fn count(&self) -> f64 {
match &self.repr {
HllRepr::Sparse { entries } => self.count_sparse(entries),
HllRepr::Dense { registers } => self.count_dense(registers),
}
}
pub fn merge(&mut self, other: &HyperLogLog) -> CoreResult<()> {
if self.precision != other.precision {
return Err(CoreError::DimensionError(error_context!(
"HyperLogLog precision must match for merge"
)));
}
self.switch_to_dense();
let other_registers = match &other.repr {
HllRepr::Dense { registers } => registers.clone(),
HllRepr::Sparse { entries } => {
let mut regs = vec![0u8; self.num_registers];
for &(idx, rho) in entries {
if rho > regs[idx as usize] {
regs[idx as usize] = rho;
}
}
regs
}
};
if let HllRepr::Dense { registers } = &mut self.repr {
for (r, &other_r) in registers.iter_mut().zip(other_registers.iter()) {
if other_r > *r {
*r = other_r;
}
}
}
Ok(())
}
pub fn precision(&self) -> u8 {
self.precision
}
pub fn num_registers(&self) -> usize {
self.num_registers
}
pub fn standard_error(&self) -> f64 {
1.04 / (self.num_registers as f64).sqrt()
}
pub fn is_sparse(&self) -> bool {
matches!(&self.repr, HllRepr::Sparse { .. })
}
pub fn clear(&mut self) {
self.repr = HllRepr::Sparse {
entries: Vec::new(),
};
}
fn hash_item(&self, item: &[u8]) -> u64 {
let mut hasher = self.hash_builder.build_hasher();
hasher.write(item);
hasher.finish()
}
fn decode_hash(&self, hash: u64) -> (u32, u8) {
let p = self.precision;
let register_idx = (hash >> (64 - p)) as u32;
let remaining = hash << p;
let rho = if remaining == 0 {
(64 - p) as u8 + 1
} else {
remaining.leading_zeros() as u8 + 1
};
(register_idx, rho)
}
fn switch_to_dense(&mut self) {
if let HllRepr::Sparse { entries } = &self.repr {
let mut registers = vec![0u8; self.num_registers];
for &(idx, rho) in entries {
if rho > registers[idx as usize] {
registers[idx as usize] = rho;
}
}
self.repr = HllRepr::Dense { registers };
}
}
fn count_sparse(&self, entries: &[(u32, u8)]) -> f64 {
if entries.is_empty() {
return 0.0;
}
let m = self.num_registers;
let mut registers = vec![0u8; m];
for &(idx, rho) in entries {
if rho > registers[idx as usize] {
registers[idx as usize] = rho;
}
}
self.count_dense(®isters)
}
fn count_dense(&self, registers: &[u8]) -> f64 {
let m = self.num_registers as f64;
let mut sum = 0.0f64;
let mut zeros = 0usize;
for &val in registers {
sum += 2.0f64.powi(-(val as i32));
if val == 0 {
zeros += 1;
}
}
let alpha = self.alpha_m();
let raw_estimate = alpha * m * m / sum;
if raw_estimate <= 2.5 * m && zeros > 0 {
let linear_count = -m * (zeros as f64 / m).ln();
return linear_count;
}
raw_estimate
}
fn alpha_m(&self) -> f64 {
let m = self.num_registers as f64;
match self.num_registers {
16 => 0.673,
32 => 0.697,
64 => 0.709,
_ => 0.7213 / (1.0 + 1.079 / m),
}
}
}
impl std::fmt::Debug for HyperLogLog {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let mode = if self.is_sparse() { "sparse" } else { "dense" };
f.debug_struct("HyperLogLog")
.field("precision", &self.precision)
.field("num_registers", &self.num_registers)
.field("mode", &mode)
.field("standard_error", &format!("{:.2}%", self.standard_error() * 100.0))
.finish()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_hll_empty_count_zero() {
let hll = HyperLogLog::new(12).expect("valid");
assert_eq!(hll.count(), 0.0);
}
#[test]
fn test_hll_single_element() {
let mut hll = HyperLogLog::new(12).expect("valid");
hll.insert(b"single_item");
let count = hll.count();
assert!(count >= 0.5, "Count too low for single element: {count}");
assert!(count < 5.0, "Count too high for single element: {count}");
}
#[test]
fn test_hll_cardinality_estimate_accuracy() {
let mut hll = HyperLogLog::new(14).expect("valid");
let n = 100_000u64;
for i in 0..n {
hll.insert(&i.to_le_bytes());
}
let estimate = hll.count();
let error = (estimate - n as f64).abs() / n as f64;
assert!(
error < 0.05,
"Estimate {estimate} is {:.2}% off from true {n}",
error * 100.0
);
}
#[test]
fn test_hll_merge_correctness() {
let mut hll1 = HyperLogLog::new(12).expect("valid");
let mut hll2 = HyperLogLog::new(12).expect("valid");
for i in 0..5000u64 {
hll1.insert(&i.to_le_bytes());
}
for i in 5000..10000u64 {
hll2.insert(&i.to_le_bytes());
}
let est1 = hll1.count();
let est2 = hll2.count();
hll1.merge(&hll2).expect("same precision");
let merged_est = hll1.count();
let error = (merged_est - 10_000.0).abs() / 10_000.0;
assert!(
error < 0.10,
"Merged estimate {merged_est} is {:.2}% off (est1={est1:.0}, est2={est2:.0})",
error * 100.0
);
}
#[test]
fn test_hll_duplicate_inserts() {
let mut hll = HyperLogLog::new(12).expect("valid");
for _ in 0..10_000 {
hll.insert(b"same_item");
}
let estimate = hll.count();
assert!(
estimate < 5.0,
"Duplicate inserts gave estimate {estimate}, expected ~1"
);
}
#[test]
fn test_hll_standard_error() {
let hll = HyperLogLog::new(12).expect("valid");
let se = hll.standard_error();
assert!((se - 0.01625).abs() < 0.001, "Unexpected standard error: {se}");
}
#[test]
fn test_hll_sparse_to_dense_transition() {
let mut hll = HyperLogLog::new(8).expect("valid"); assert!(hll.is_sparse());
for i in 0..1000u64 {
hll.insert(&i.to_le_bytes());
}
assert!(!hll.is_sparse(), "Expected dense mode after many inserts");
}
#[test]
fn test_hll_invalid_precision() {
assert!(HyperLogLog::new(3).is_err());
assert!(HyperLogLog::new(19).is_err());
assert!(HyperLogLog::new(0).is_err());
}
#[test]
fn test_hll_clear() {
let mut hll = HyperLogLog::new(10).expect("valid");
for i in 0..500u64 {
hll.insert(&i.to_le_bytes());
}
assert!(hll.count() > 100.0);
hll.clear();
assert_eq!(hll.count(), 0.0);
assert!(hll.is_sparse());
}
#[test]
fn test_hll_merge_incompatible() {
let mut hll1 = HyperLogLog::new(10).expect("valid");
let hll2 = HyperLogLog::new(12).expect("valid");
assert!(hll1.merge(&hll2).is_err());
}
#[test]
fn test_hll_various_precisions() {
for p in MIN_PRECISION..=MAX_PRECISION {
let mut hll = HyperLogLog::new(p).expect("valid precision");
assert_eq!(hll.num_registers(), 1 << p);
hll.insert(b"test");
assert!(hll.count() > 0.0);
}
}
#[test]
fn test_hll_large_cardinality() {
let mut hll = HyperLogLog::new(14).expect("valid");
let n = 1_000_000u64;
for i in 0..n {
hll.insert(&i.to_le_bytes());
}
let estimate = hll.count();
let error = (estimate - n as f64).abs() / n as f64;
assert!(
error < 0.03,
"Large cardinality estimate {estimate:.0} is {:.2}% off from {n}",
error * 100.0
);
}
}