use alloc::vec;
use alloc::vec::Vec;
use core::hash::{Hash, Hasher};
use std::hash::DefaultHasher;
#[cfg(not(feature = "std"))]
#[allow(unused_imports)]
use num_traits::Float;
use crate::error::{RcfError, RcfResult};
pub const MIN_PRECISION: u8 = 4;
pub const MAX_PRECISION: u8 = 16;
pub const DEFAULT_PRECISION: u8 = 12;
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(try_from = "HyperLogLogShadow"))]
pub struct HyperLogLog {
precision: u8,
registers: Vec<u8>,
total_added: u64,
}
#[cfg(feature = "serde")]
#[derive(serde::Serialize, serde::Deserialize)]
#[allow(clippy::missing_docs_in_private_items)]
struct HyperLogLogShadow {
precision: u8,
registers: Vec<u8>,
total_added: u64,
}
#[cfg(feature = "serde")]
impl TryFrom<HyperLogLogShadow> for HyperLogLog {
type Error = RcfError;
fn try_from(raw: HyperLogLogShadow) -> Result<Self, Self::Error> {
if !(MIN_PRECISION..=MAX_PRECISION).contains(&raw.precision) {
return Err(RcfError::InvalidConfig(
alloc::format!(
"HyperLogLog: precision {} out of [{MIN_PRECISION}, {MAX_PRECISION}]",
raw.precision
)
.into(),
));
}
let expected = 1_usize << raw.precision;
if raw.registers.len() != expected {
return Err(RcfError::InvalidConfig(
alloc::format!(
"HyperLogLog: register bank length {} != expected {expected} for precision {}",
raw.registers.len(),
raw.precision
)
.into(),
));
}
Ok(Self {
precision: raw.precision,
registers: raw.registers,
total_added: raw.total_added,
})
}
}
impl HyperLogLog {
pub fn new(precision: u8) -> RcfResult<Self> {
if !(MIN_PRECISION..=MAX_PRECISION).contains(&precision) {
return Err(RcfError::InvalidConfig(
alloc::format!(
"HyperLogLog: precision {precision} out of [{MIN_PRECISION}, {MAX_PRECISION}]"
)
.into(),
));
}
let m = 1_usize << precision;
Ok(Self {
precision,
registers: vec![0_u8; m],
total_added: 0,
})
}
#[must_use]
pub fn with_default_precision() -> Self {
Self::new(DEFAULT_PRECISION).expect("DEFAULT_PRECISION is in range")
}
#[must_use]
pub fn register_count(&self) -> usize {
1_usize << self.precision
}
#[must_use]
pub fn precision(&self) -> u8 {
self.precision
}
#[must_use]
pub fn total_added(&self) -> u64 {
self.total_added
}
#[must_use]
pub fn memory_bytes(&self) -> usize {
self.registers.len()
}
pub fn add<T: Hash + ?Sized>(&mut self, value: &T) {
let mut h = DefaultHasher::new();
value.hash(&mut h);
self.add_hash(h.finish());
}
#[inline]
pub fn add_bytes(&mut self, key: &[u8]) {
let mut h = DefaultHasher::new();
key.hash(&mut h);
self.add_hash(h.finish());
}
#[allow(clippy::cast_possible_truncation)]
#[inline]
pub fn add_hash(&mut self, hash: u64) {
self.total_added = self.total_added.saturating_add(1);
let p = self.precision;
let idx = (hash >> (64 - p)) as usize;
let tail = hash << p;
let rho = if tail == 0 {
64 - p + 1
} else {
(tail.leading_zeros() as u8) + 1
};
let slot = &mut self.registers[idx];
if rho > *slot {
*slot = rho;
}
}
#[must_use]
#[allow(
clippy::cast_precision_loss,
clippy::cast_possible_truncation,
clippy::cast_sign_loss
)]
pub fn estimate(&self) -> u64 {
let m = self.register_count();
let m_f = m as f64;
let mut sum = 0.0_f64;
let mut zeros: usize = 0;
for &r in &self.registers {
if r == 0 {
zeros += 1;
}
sum += 2.0_f64.powi(-(i32::from(r)));
}
let alpha = alpha_m(m);
let raw = alpha * m_f * m_f / sum;
if raw <= 2.5 * m_f && zeros > 0 {
let v = zeros as f64;
return (m_f * (m_f / v).ln()).round().max(0.0) as u64;
}
raw.round().max(0.0) as u64
}
pub fn merge(&mut self, other: &Self) -> RcfResult<()> {
if self.precision != other.precision {
return Err(RcfError::InvalidConfig(
alloc::format!(
"HyperLogLog::merge: precision mismatch ({} vs {})",
self.precision,
other.precision
)
.into(),
));
}
for (slot, other_r) in self.registers.iter_mut().zip(other.registers.iter()) {
if *other_r > *slot {
*slot = *other_r;
}
}
self.total_added = self.total_added.saturating_add(other.total_added);
Ok(())
}
pub fn reset(&mut self) {
self.registers.iter_mut().for_each(|r| *r = 0);
self.total_added = 0;
}
}
#[must_use]
#[allow(clippy::cast_precision_loss)]
fn alpha_m(m: usize) -> f64 {
match m {
16 => 0.673,
32 => 0.697,
64 => 0.709,
_ => 0.7213 / (1.0 + 1.079 / m as f64),
}
}
#[cfg(test)]
#[allow(
clippy::cast_precision_loss,
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
clippy::cast_possible_wrap,
clippy::items_after_statements,
clippy::manual_range_contains
)]
mod tests {
use super::*;
#[test]
fn new_rejects_out_of_range_precision() {
assert!(HyperLogLog::new(3).is_err());
assert!(HyperLogLog::new(17).is_err());
assert!(HyperLogLog::new(MIN_PRECISION).is_ok());
assert!(HyperLogLog::new(MAX_PRECISION).is_ok());
}
#[test]
fn empty_sketch_estimates_zero() {
let hll = HyperLogLog::with_default_precision();
assert_eq!(hll.estimate(), 0);
}
#[test]
fn exact_cardinality_at_tiny_scale() {
let mut hll = HyperLogLog::with_default_precision();
for i in 0..10_u32 {
hll.add(&i.to_le_bytes());
}
let est = hll.estimate();
assert!(est >= 9 && est <= 11, "est {est}");
}
#[test]
fn estimates_within_5pct_at_10k_distinct() {
let mut hll = HyperLogLog::with_default_precision();
for i in 0..10_000_u32 {
hll.add(&i.to_le_bytes());
}
let est = hll.estimate();
let err = (est as i64 - 10_000).unsigned_abs() as f64 / 10_000.0;
assert!(err < 0.05, "err {err:.4}, est {est}");
}
#[test]
fn estimates_within_3pct_at_100k_distinct_p14() {
let mut hll = HyperLogLog::new(14).expect("p=14 in range");
for i in 0..100_000_u32 {
hll.add(&i.to_le_bytes());
}
let est = hll.estimate();
let err = (est as i64 - 100_000).unsigned_abs() as f64 / 100_000.0;
assert!(err < 0.03, "err {err:.4}, est {est}");
}
#[test]
fn duplicate_inserts_do_not_inflate_estimate() {
let mut hll = HyperLogLog::with_default_precision();
for _ in 0..1_000 {
for i in 0..100_u32 {
hll.add(&i.to_le_bytes());
}
}
let est = hll.estimate();
assert!(est >= 90 && est <= 110, "est {est}");
assert_eq!(hll.total_added(), 100_000);
}
#[test]
fn merge_agrees_with_single_sketch() {
let mut a = HyperLogLog::with_default_precision();
let mut b = HyperLogLog::with_default_precision();
let mut full = HyperLogLog::with_default_precision();
for i in 0..5_000_u32 {
a.add(&i.to_le_bytes());
full.add(&i.to_le_bytes());
}
for i in 5_000..10_000_u32 {
b.add(&i.to_le_bytes());
full.add(&i.to_le_bytes());
}
a.merge(&b).expect("same precision");
let merged_est = a.estimate();
let full_est = full.estimate();
let delta = (merged_est as i64 - full_est as i64).unsigned_abs();
assert!(
delta < 200,
"delta {delta}, merged {merged_est}, full {full_est}"
);
}
#[test]
fn merge_rejects_precision_mismatch() {
let mut a = HyperLogLog::new(10).unwrap();
let b = HyperLogLog::new(12).unwrap();
assert!(a.merge(&b).is_err());
}
#[test]
fn reset_clears_all_registers_and_counter() {
let mut hll = HyperLogLog::with_default_precision();
for i in 0..1_000_u32 {
hll.add(&i.to_le_bytes());
}
assert!(hll.estimate() > 0);
hll.reset();
assert_eq!(hll.estimate(), 0);
assert_eq!(hll.total_added(), 0);
}
#[test]
fn add_hash_path_matches_add_bytes_path() {
let mut a = HyperLogLog::with_default_precision();
let mut b = HyperLogLog::with_default_precision();
for i in 0..1_000_u32 {
let bytes = i.to_le_bytes();
a.add_bytes(&bytes);
use core::hash::Hash;
use std::hash::DefaultHasher;
let mut h = DefaultHasher::new();
bytes.hash(&mut h);
b.add_hash(h.finish());
}
assert_eq!(a.estimate(), b.estimate());
}
#[test]
fn memory_bytes_matches_register_count() {
let hll = HyperLogLog::new(12).unwrap();
assert_eq!(hll.memory_bytes(), 4096);
let hll16 = HyperLogLog::new(16).unwrap();
assert_eq!(hll16.memory_bytes(), 65_536);
}
#[cfg(all(feature = "serde", feature = "postcard"))]
#[test]
fn postcard_roundtrip_preserves_estimate() {
let mut hll = HyperLogLog::with_default_precision();
for i in 0..5_000_u32 {
hll.add(&i.to_le_bytes());
}
let before = hll.estimate();
let bytes = postcard::to_allocvec(&hll).expect("serde ok");
let back: HyperLogLog = postcard::from_bytes(&bytes).expect("serde ok");
assert_eq!(back.estimate(), before);
assert_eq!(back.total_added(), hll.total_added());
}
#[cfg(all(feature = "serde", feature = "postcard"))]
#[test]
fn deserialize_rejects_out_of_range_precision() {
let bad = HyperLogLogShadow {
precision: MAX_PRECISION + 1,
registers: alloc::vec![0_u8; 1 << MAX_PRECISION],
total_added: 0,
};
let bytes = postcard::to_allocvec(&bad).unwrap();
let back: Result<HyperLogLog, _> = postcard::from_bytes(&bytes);
assert!(back.is_err());
}
#[cfg(all(feature = "serde", feature = "postcard"))]
#[test]
fn deserialize_rejects_register_length_mismatch() {
let bad = HyperLogLogShadow {
precision: DEFAULT_PRECISION,
registers: alloc::vec![0_u8; 10], total_added: 0,
};
let bytes = postcard::to_allocvec(&bad).unwrap();
let back: Result<HyperLogLog, _> = postcard::from_bytes(&bytes);
assert!(back.is_err());
}
}