use crate::{HyperLogLog, alpha_m, fnv1a64};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct SparseHyperLogLog {
p: u32,
m: u32,
alpha: f64,
sparse: Option<Vec<(u32, u8)>>,
dense: Option<HyperLogLog>,
threshold: usize,
}
impl SparseHyperLogLog {
pub fn new(precision: u32) -> Self {
let p = precision.clamp(4, 18);
let m = 1u32 << p;
let threshold = (m / 4) as usize;
Self::with_threshold(p, threshold)
}
pub fn with_threshold(precision: u32, threshold: usize) -> Self {
let p = precision.clamp(4, 18);
let m = 1u32 << p;
let alpha = alpha_m(m);
Self {
p,
m,
alpha,
sparse: Some(Vec::new()),
dense: None,
threshold,
}
}
pub fn precision(&self) -> u32 {
self.p
}
pub fn register_count(&self) -> u32 {
self.m
}
pub fn is_sparse(&self) -> bool {
self.sparse.is_some()
}
pub fn entry_count(&self) -> usize {
if let Some(list) = &self.sparse {
list.len()
} else if let Some(d) = &self.dense {
d.registers().iter().filter(|&&r| r != 0).count()
} else {
0
}
}
pub fn add(&mut self, key: &str) {
let h = fnv1a64(key.as_bytes());
let idx = (h >> (64 - self.p)) as u32;
let w = (h << self.p) | (1u64 << (self.p - 1));
let r = (w.leading_zeros() + 1) as u8;
if let Some(list) = self.sparse.as_mut() {
if let Some(pos) = list.iter().position(|(i, _)| *i == idx) {
if r > list[pos].1 {
list[pos].1 = r;
}
} else {
list.push((idx, r));
if list.len() >= self.threshold {
self.promote();
}
}
} else if let Some(d) = self.dense.as_mut() {
d.add(key);
}
}
pub fn estimate(&self) -> f64 {
if let Some(list) = &self.sparse {
let m = self.m as f64;
let held_sum: f64 = list.iter().map(|(_, r)| 2f64.powi(-(*r as i32))).sum();
let zero_count = self.m as usize - list.len();
let sum = held_sum + zero_count as f64;
let raw = self.alpha * m * m / sum;
if zero_count > 0 && raw <= 2.5 * m {
-m * (zero_count as f64 / m).ln()
} else {
raw
}
} else if let Some(d) = &self.dense {
d.estimate()
} else {
0.0
}
}
pub fn promote(&mut self) {
if self.dense.is_some() {
return;
}
let list = self.sparse.take().unwrap_or_default();
let mut dense = HyperLogLog::new(self.p);
dense.apply_sparse(&list);
self.dense = Some(dense);
}
pub fn as_dense(&self) -> Option<&HyperLogLog> {
self.dense.as_ref()
}
}
impl HyperLogLog {
pub(crate) fn apply_sparse(&mut self, list: &[(u32, u8)]) {
for &(idx, r) in list {
let i = idx as usize;
if i < self.registers.len() && r > self.registers[i] {
self.registers[i] = r;
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn starts_sparse_and_grows_linearly() {
let mut h = SparseHyperLogLog::new(10);
assert!(h.is_sparse());
for i in 0..20 {
h.add(&format!("k{i}"));
}
assert!(h.is_sparse(), "still sparse below threshold");
assert!(h.entry_count() > 0);
}
#[test]
fn promotes_to_dense_at_threshold() {
let mut h = SparseHyperLogLog::new(8);
for i in 0..500 {
h.add(&format!("k{i}"));
}
assert!(!h.is_sparse(), "promoted past threshold");
assert!(h.as_dense().is_some());
}
#[test]
fn estimate_matches_dense_after_promotion() {
let mut sparse = SparseHyperLogLog::new(10);
let mut dense = HyperLogLog::new(10);
for i in 0..2_000 {
let k = format!("user-{i}");
sparse.add(&k);
dense.add(&k);
}
let s = sparse.estimate();
let d = dense.estimate();
let rel = ((s - d).abs() / d.max(1.0)).abs();
assert!(
rel < 0.05,
"post-promotion match within 5%: sparse={s} dense={d}"
);
}
#[test]
fn low_cardinality_accurate_via_linear_counting() {
let mut h = SparseHyperLogLog::new(10);
for i in 0..50 {
h.add(&format!("k{i}"));
}
let est = h.estimate();
assert!(
est > 40.0 && est < 60.0,
"low-card linear counting: got {est}"
);
}
#[test]
fn force_promote_idempotent() {
let mut h = SparseHyperLogLog::new(8);
h.add("a");
h.promote();
assert!(!h.is_sparse());
h.promote();
assert!(!h.is_sparse());
}
#[test]
fn duplicate_keys_dont_inflate_entry_count() {
let mut h = SparseHyperLogLog::new(10);
for _ in 0..1_000 {
h.add("same-key");
}
assert!(h.is_sparse());
assert_eq!(
h.entry_count(),
1,
"one register touched, regardless of insert count"
);
}
}