// exaloglog 0.15.0

//! `ExaLogLog`: packed 28-bit ExaLogLog (`ELL(t=2, d=20)`).
//!
//! The default and most space-efficient configuration. Each register
//! occupies 28 bits (`q = 8`, `d = 20`), packed as two registers per 7
//! bytes. MVP = 3.67, **43% smaller than HLL** with 6-bit registers at
//! the same estimation error.
//!
//! Per-register read costs an unaligned 4-byte load and a mask; per-
//! register write costs the same plus a read-modify-write of the byte
//! shared with the sibling register. Concurrent updates require external
//! synchronization because the shared boundary byte cannot be CAS'd
//! safely. For lock-free concurrent updates use [`crate::ExaLogLogFast`].
//!
//! # Sparse mode
//!
//! For low cardinalities, an `ExaLogLog` keeps a sorted list of 32-bit
//! hash tokens (paper §4.3) instead of allocating the dense register
//! array. Memory grows linearly with the number of distinct elements
//! until the break-even point — `m · 7/8` distinct tokens, the count at
//! which the sparse Vec equals the dense register array in size — where
//! the sketch automatically promotes to the dense representation. After
//! that the storage cost is fixed.
//!
//! Effects on the public API in sparse mode:
//!
//! - [`ExaLogLog::estimate`] / [`ExaLogLog::estimate_ml`] use the
//!   token-based ML estimator from Algorithm 7 of the paper. For very
//!   small `n` the sketch is *exact*: `estimate_ml` recovers the true
//!   distinct count.
//! - The martingale (HIP) estimator is not maintained in sparse mode:
//!   [`ExaLogLog::estimate_martingale`] returns `None` while the sketch
//!   is sparse and keeps returning `None` after promotion to dense,
//!   because the per-insert state can't be reconstructed from collected
//!   tokens. Start with [`ExaLogLog::new_dense`] if you need it.
//!
//! # Custom hashers
//!
//! [`ExaLogLog::add`] uses `DefaultHasher` (SipHash13). For
//! high-throughput ingestion, use a faster hash and call
//! [`ExaLogLog::add_hash`].

use std::hash::{DefaultHasher, Hash, Hasher};

use crate::math;
use crate::{DeserializeError, FORMAT_VERSION, MAGIC, MergeError};
use crate::{MAX_P, MIN_P, T};

/// The `d` parameter of `ELL(t = 2, d = 20)`; passed to every register
/// helper in `math` and written into the serialized header.
const D: u32 = 20;
/// Mask selecting the low 28 bits of a `u32`, i.e. one register value.
const REGISTER_MASK: u32 = (1u32 << 28) - 1;
/// Serialized header length in bytes: 4-byte magic followed by one byte
/// each for format version, `t`, `d` and `p`.
const HEADER_LEN: usize = 8;
/// Two 28-bit registers are packed into each 7-byte chunk.
const BYTES_PER_PAIR: usize = 7;

/// Format-version flag bit indicating sparse-mode payload in `to_bytes`.
const FORMAT_FLAG_SPARSE: u8 = 0x80;

/// Packed 28-bit ExaLogLog sketch that starts sparse (for `p` small
/// enough) and promotes itself to dense storage. See module docs.
#[derive(Clone, Debug)]
pub struct ExaLogLog {
    /// Precision: the sketch addresses `2^p` registers.
    p: u32,
    /// Current representation: sparse token list or packed registers.
    storage: Storage,
    /// Running martingale (HIP) estimate; only meaningful while
    /// `martingale_invalid` is `false`.
    martingale: f64,
    /// Divisor of the martingale increment (`martingale += 1 / mu`),
    /// lowered in `add_hash` every time a register changes.
    mu: f64,
    /// When `true`, `estimate_martingale` returns `None` and `add_hash`
    /// no longer maintains `martingale` / `mu`.
    martingale_invalid: bool,
}

/// Backing representation of an [`ExaLogLog`].
#[derive(Clone, Debug)]
enum Storage {
    /// Sorted, deduplicated list of 32-bit hash tokens.
    Sparse(Vec<u32>),
    /// Packed dense register array, `m / 2` chunks of 7 bytes
    /// (two 28-bit registers per chunk).
    Dense(Box<[u8]>),
}

/// Break-even token count for precision `p`: with `m = 2^p` registers,
/// `m · 7/8` four-byte tokens occupy as many bytes as the dense register
/// array. The result is rounded down.
fn sparse_capacity(p: u32) -> usize {
    let registers = 1usize << p;
    registers * 7 / 8
}

/// Maximum precision at which sparse mode is safe. Tokens are
/// `V + 6 = 32` bits and reconstruct the original hash faithfully only
/// when `p + t ≤ V`, i.e. `p ≤ V − T = 24`. For `p > 24` we skip sparse
/// mode and start dense (see `ExaLogLog::new`).
const MAX_P_SPARSE: u32 = math::SPARSE_V - T;

/// Size in bytes of the packed dense register array for precision `p`:
/// one `BYTES_PER_PAIR`-byte chunk for every pair of the `2^p` registers.
fn dense_storage_bytes(p: u32) -> usize {
    let register_pairs = (1usize << p) / 2;
    register_pairs * BYTES_PER_PAIR
}

impl ExaLogLog {
    /// Build an empty sketch with room for `2^p` registers.
    ///
    /// For `p ≤ 24` the sketch begins in sparse mode (a sorted list of
    /// 32-bit hash tokens) and promotes itself to dense once the token
    /// count passes the per-`p` break-even point. For `p > 24` the 32-bit
    /// token format would be lossy (`p + t > V`), so the sketch is dense
    /// from the start.
    ///
    /// # Panics
    ///
    /// Panics if `p` is outside `[MIN_P, MAX_P]`.
    pub fn new(p: u32) -> Self {
        assert!(
            (MIN_P..=MAX_P).contains(&p),
            "precision p={p} out of range [{MIN_P}, {MAX_P}]"
        );
        if p <= MAX_P_SPARSE {
            Self {
                p,
                storage: Storage::Sparse(Vec::new()),
                martingale: 0.0,
                mu: 1.0,
                // The martingale estimator is not maintained in sparse mode.
                martingale_invalid: true,
            }
        } else {
            Self::new_dense(p)
        }
    }

    /// Create a sketch sized so the theoretical RMSE at the eventual
    /// cardinality is at most `target_rmse` (e.g., `0.02` for 2%).
    /// Picks the smallest precision `p` that satisfies
    /// `√(MVP / ((q + d) · 2^p)) ≤ target_rmse`, with `MVP = 3.67`,
    /// `q + d = 28`. The result is clamped to `[MIN_P, MAX_P]`, so a
    /// target tighter than `MAX_P` can deliver is not guaranteed.
    ///
    /// # Panics
    ///
    /// Panics if `target_rmse` is non-finite or non-positive.
    ///
    /// # Examples
    ///
    /// ```
    /// use exaloglog::ExaLogLog;
    /// let s = ExaLogLog::with_target_rmse(0.02); // 2% target
    /// // 3.67 / (28 · 0.02²) ≈ 328 registers → p = 9 (m = 512),
    /// // theoretical RMSE ≈ 1.6%. p = 8 would give ≈ 2.3%.
    /// assert_eq!(s.precision(), 9);
    /// ```
    pub fn with_target_rmse(target_rmse: f64) -> Self {
        assert!(
            target_rmse.is_finite() && target_rmse > 0.0,
            "with_target_rmse: target must be finite and positive, got {target_rmse}"
        );
        const MVP: f64 = 3.67;
        const BITS_PER_REGISTER: f64 = 28.0;
        // Solve MVP / (bits · m) ≤ rmse² for the register count m.
        let m_needed = MVP / (BITS_PER_REGISTER * target_rmse * target_rmse);
        // The float → i32 cast saturates, so ±infinity (from an
        // underflowing or overflowing square) still lands inside the clamp.
        let p = (m_needed.log2().ceil() as i32).clamp(MIN_P as i32, MAX_P as i32) as u32;
        Self::new(p)
    }

    /// Build an empty sketch that is dense from the start, bypassing
    /// sparse mode. Handy when the cardinality is known to exceed the
    /// break-even point, so sparse-mode allocations would be wasted. A
    /// sketch created this way starts with a valid martingale estimator.
    ///
    /// # Panics
    ///
    /// Panics if `p` is outside `[MIN_P, MAX_P]`.
    pub fn new_dense(p: u32) -> Self {
        assert!(
            (MIN_P..=MAX_P).contains(&p),
            "precision p={p} out of range [{MIN_P}, {MAX_P}]"
        );
        let registers = vec![0u8; dense_storage_bytes(p)].into_boxed_slice();
        Self {
            p,
            storage: Storage::Dense(registers),
            martingale: 0.0,
            mu: 1.0,
            martingale_invalid: false,
        }
    }

    /// Precision parameter.
    #[must_use]
    pub fn precision(&self) -> u32 {
        self.p
    }

    /// Register count `m = 2^p`, independent of the current storage mode.
    #[must_use]
    pub fn num_registers(&self) -> usize {
        1usize << self.p
    }

    /// Bytes currently held by the storage: the packed register array in
    /// dense mode, or the token vector's *capacity* (four bytes per slot)
    /// in sparse mode.
    #[must_use]
    pub fn register_bytes(&self) -> usize {
        match &self.storage {
            Storage::Dense(registers) => registers.len(),
            Storage::Sparse(tokens) => 4 * tokens.capacity(),
        }
    }

    /// Whether the sketch still stores hash tokens rather than registers.
    #[must_use]
    pub fn is_sparse(&self) -> bool {
        match self.storage {
            Storage::Sparse(_) => true,
            Storage::Dense(_) => false,
        }
    }

    /// The fixed `d` parameter of this configuration (20).
    #[must_use]
    pub fn d_parameter() -> u32 {
        const PARAMETER: u32 = D;
        PARAMETER
    }

    /// Value of register `i`.
    ///
    /// In sparse mode there is no register array, so the value is
    /// rebuilt on the fly: every token is walked and the ones that map
    /// to register `i` are folded in with Algorithm 2's update rule.
    /// That is linear in the token count per call; for repeated access
    /// call [`Self::densify`] once first.
    ///
    /// # Panics
    ///
    /// Panics if `i ≥ self.num_registers()`. The check is explicit and
    /// identical in both storage modes.
    #[inline]
    #[must_use]
    pub fn get_register(&self, i: usize) -> u32 {
        assert!(
            i < self.num_registers(),
            "register index {i} out of range [0, {})",
            self.num_registers()
        );
        match &self.storage {
            Storage::Dense(_) => self.read_dense_register(i),
            Storage::Sparse(tokens) => tokens
                .iter()
                .map(|&tok| math::hash_to_register_k(math::token_to_hash(tok), self.p))
                .filter(|(idx, _)| *idx == i)
                .fold(0u32, |acc, (_, k)| math::apply_insert(acc, k, D)),
        }
    }

    /// Unpack register `i` from the dense array.
    ///
    /// Each 7-byte chunk holds two registers. The even one is the low 28
    /// bits of the little-endian word at bytes `0..4`; the odd one is the
    /// high 28 bits of the word at bytes `3..7`. Byte 3 is shared.
    ///
    /// Must only be called in dense mode (`unreachable!` otherwise).
    fn read_dense_register(&self, i: usize) -> u32 {
        let Storage::Dense(bytes) = &self.storage else {
            unreachable!("read_dense_register called in sparse mode")
        };
        let odd = i & 1;
        // Odd registers start three bytes into the chunk and sit four
        // bits up in the loaded word.
        let start = (i >> 1) * BYTES_PER_PAIR + 3 * odd;
        let word = u32::from_le_bytes([
            bytes[start],
            bytes[start + 1],
            bytes[start + 2],
            bytes[start + 3],
        ]);
        (word >> (4 * odd)) & REGISTER_MASK
    }

    /// Store the low 28 bits of `value` into register `i` of the dense
    /// array, leaving the sibling register's nibble in the shared byte
    /// untouched.
    ///
    /// Must only be called in dense mode (`unreachable!` otherwise).
    /// Debug builds additionally assert that `i` is in range and that
    /// `value` fits in 28 bits.
    fn write_dense_register(&mut self, i: usize, value: u32) {
        debug_assert!(i < self.num_registers());
        debug_assert!(
            value <= REGISTER_MASK,
            "register value {value:#x} exceeds 28 bits"
        );
        let Storage::Dense(bytes) = &mut self.storage else {
            unreachable!("write_dense_register called in sparse mode")
        };
        let is_odd = i & 1 == 1;
        let start = (i >> 1) * BYTES_PER_PAIR + if is_odd { 3 } else { 0 };
        let window = &mut bytes[start..start + 4];
        let current = u32::from_le_bytes([window[0], window[1], window[2], window[3]]);
        let payload = value & REGISTER_MASK;
        let updated = if is_odd {
            // Keep the sibling's nibble in the low four bits of byte 3.
            (current & 0x0F) | (payload << 4)
        } else {
            // Keep the sibling's nibble in the top four bits of byte 3.
            (current & !REGISTER_MASK) | payload
        };
        window.copy_from_slice(&updated.to_le_bytes());
    }

    /// Promote a sparse sketch to dense storage; a no-op when already
    /// dense. Afterwards `get_register` is O(1).
    ///
    /// Every collected token is replayed into a zeroed register array.
    /// The martingale estimator is marked invalid, because its running
    /// state cannot be rebuilt from tokens.
    pub fn densify(&mut self) {
        let Storage::Sparse(sparse) = &mut self.storage else {
            return;
        };
        let tokens = std::mem::take(sparse);
        let zeroed = vec![0u8; dense_storage_bytes(self.p)].into_boxed_slice();
        self.storage = Storage::Dense(zeroed);
        for tok in tokens {
            let (i, k) = math::hash_to_register_k(math::token_to_hash(tok), self.p);
            let before = self.read_dense_register(i);
            let after = math::apply_insert(before, k, D);
            if after != before {
                self.write_dense_register(i, after);
            }
        }
        // Poison the running state so accidental use is conspicuous.
        self.martingale = f64::NAN;
        self.mu = f64::NAN;
        self.martingale_invalid = true;
    }

    /// Insert one 64-bit hash value (Algorithm 2).
    ///
    /// Sparse mode: the hash is reduced to a token and inserted into the
    /// sorted list if not already present; the sketch is promoted to
    /// dense once the list outgrows the break-even capacity.
    ///
    /// Dense mode: the target register is updated and, while the
    /// martingale estimator is still valid, its running state is advanced
    /// whenever the register actually changes.
    pub fn add_hash(&mut self, hash: u64) {
        if let Storage::Sparse(tokens) = &mut self.storage {
            let token = math::hash_to_token(hash);
            let Err(slot) = tokens.binary_search(&token) else {
                return; // already recorded
            };
            tokens.insert(slot, token);
            if tokens.len() > sparse_capacity(self.p) {
                self.densify();
            }
            return;
        }

        let (i, k) = math::hash_to_register_k(hash, self.p);
        let old = self.read_dense_register(i);
        let updated = math::apply_insert(old, k, D);
        if updated == old {
            return;
        }
        if !self.martingale_invalid {
            self.martingale += 1.0 / self.mu;
            self.mu -= math::h(old, self.p, D) - math::h(updated, self.p, D);
            // Keep `mu` strictly positive so the next `1 / mu` stays finite.
            if self.mu < 1e-300 {
                self.mu = 1e-300;
            }
        }
        self.write_dense_register(i, updated);
    }

    /// Insert a slice of pre-computed 64-bit hashes.
    ///
    /// Sparse mode: all tokens are appended in one go, then the list is
    /// sorted and deduplicated (`O(N log N)`); if the result exceeds the
    /// break-even threshold the sketch is promoted to dense. Dense mode:
    /// a plain loop over [`Self::add_hash`].
    pub fn add_hashes(&mut self, hashes: &[u64]) {
        if let Storage::Sparse(tokens) = &mut self.storage {
            tokens.reserve(hashes.len());
            tokens.extend(hashes.iter().map(|&h| math::hash_to_token(h)));
            tokens.sort_unstable();
            tokens.dedup();
            if tokens.len() > sparse_capacity(self.p) {
                self.densify();
            }
        } else {
            hashes.iter().for_each(|&h| self.add_hash(h));
        }
    }

    /// Insert a batch of hashes with register-ordered updates for better
    /// cache locality on large batches.
    ///
    /// All `(register, k)` pairs are computed up front and counting-sorted
    /// by register index; each touched register is then read once,
    /// updated with all of its pairs, and written back at most once.
    /// Register traffic drops from `O(N)` to `O(distinct registers)` and
    /// the access pattern becomes sequential.
    ///
    /// Worth using when the register array doesn't fit in L1 cache
    /// (`p ≥ 14` on typical x86_64) and the batch is large enough to
    /// amortize the sort — empirically about **89% faster than the scalar
    /// loop at `p = 16`**. At smaller `p` prefer [`Self::add_hashes`].
    /// Allocates roughly `8 · hashes.len() + 8 · m` extra bytes.
    ///
    /// An empty batch is a no-op. Otherwise the sketch is promoted to
    /// dense if needed, and the martingale estimator is invalidated
    /// unconditionally, even when the sketch was already dense.
    pub fn add_hashes_sorted(&mut self, hashes: &[u64]) {
        if hashes.is_empty() {
            return;
        }
        self.densify();
        let p = self.p;
        let mut iks: Vec<(u32, u32)> = Vec::with_capacity(hashes.len());
        #[cfg(all(target_arch = "x86_64", feature = "simd"))]
        crate::simd_x86::fill_iks(hashes, p, &mut iks);
        #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
        math::fill_iks(hashes, p, &mut iks);
        math::counting_sort_by_register(&mut iks, 1usize << p);

        let mut cursor = 0;
        while let Some(&(register, _)) = iks.get(cursor) {
            let i = register as usize;
            let before = self.read_dense_register(i);
            let mut after = before;
            // Consume the whole run of pairs that target this register.
            while let Some(&(same, k)) = iks.get(cursor) {
                if same != register {
                    break;
                }
                after = math::apply_insert(after, k, D);
                cursor += 1;
            }
            if after != before {
                self.write_dense_register(i, after);
            }
        }
        self.martingale_invalid = true;
    }

    /// Hash `item` with the standard library's `DefaultHasher` and insert
    /// the result. For high-throughput workloads, hash with something
    /// faster (xxhash3, wyhash, etc.) and call [`Self::add_hash`].
    pub fn add<H: Hash + ?Sized>(&mut self, item: &H) {
        let digest = {
            let mut state = DefaultHasher::new();
            item.hash(&mut state);
            state.finish()
        };
        self.add_hash(digest);
    }

    /// Best available cardinality estimate. In sparse mode this is the
    /// token-based ML estimator (Algorithm 7); in dense mode it's the
    /// register-based ML estimator. Either way it works from the persistent
    /// state alone, so it is valid after merges and deserialization.
    #[must_use]
    pub fn estimate(&self) -> f64 {
        self.estimate_ml()
    }

    /// Maximum-likelihood cardinality estimate: token-based in sparse
    /// mode, register-based (coefficients from all `2^p` registers, then
    /// the ML solve) in dense mode.
    #[must_use]
    pub fn estimate_ml(&self) -> f64 {
        if let Storage::Sparse(tokens) = &self.storage {
            return math::estimate_from_tokens(tokens);
        }
        let registers = (0..self.num_registers()).map(|i| self.read_dense_register(i));
        let (alpha, beta) = math::compute_alpha_beta(registers, self.p, D);
        math::solve_ml(alpha, &beta, self.p)
    }

    /// Martingale (HIP) estimate, or `None` once the running state is no
    /// longer valid: in sparse mode, and after any promotion to dense,
    /// merge, `add_hashes_sorted`, `reduce`, or deserialization.
    #[must_use]
    pub fn estimate_martingale(&self) -> Option<f64> {
        (!self.martingale_invalid).then_some(self.martingale)
    }

    /// Fold `other` into `self`.
    ///
    /// If both sketches are sparse the result is the union of their token
    /// lists, promoted to dense if it outgrows the break-even capacity.
    /// Otherwise `self` is densified and `other` is merged in: its tokens
    /// are replayed if it is sparse, or registers are combined per
    /// Algorithm 5 if it is dense. That path always invalidates the
    /// martingale estimator.
    ///
    /// # Errors
    ///
    /// Returns [`MergeError::PrecisionMismatch`] when the precisions
    /// differ; `self` is left untouched in that case.
    pub fn merge(&mut self, other: &Self) -> Result<(), MergeError> {
        if self.p != other.p {
            return Err(MergeError::PrecisionMismatch {
                lhs: self.p,
                rhs: other.p,
            });
        }

        if let (Storage::Sparse(mine), Storage::Sparse(theirs)) = (&self.storage, &other.storage)
        {
            // Two-pointer union of two sorted, distinct lists.
            let mut union = Vec::with_capacity(mine.len() + theirs.len());
            let (mut a, mut b) = (0usize, 0usize);
            while a < mine.len() && b < theirs.len() {
                let (x, y) = (mine[a], theirs[b]);
                match x.cmp(&y) {
                    std::cmp::Ordering::Less => {
                        union.push(x);
                        a += 1;
                    }
                    std::cmp::Ordering::Greater => {
                        union.push(y);
                        b += 1;
                    }
                    std::cmp::Ordering::Equal => {
                        union.push(x);
                        a += 1;
                        b += 1;
                    }
                }
            }
            union.extend_from_slice(&mine[a..]);
            union.extend_from_slice(&theirs[b..]);

            let needs_promotion = union.len() > sparse_capacity(self.p);
            self.storage = Storage::Sparse(union);
            if needs_promotion {
                self.densify();
            }
            return Ok(());
        }

        // At least one side is dense: work on self's register array.
        self.densify();
        match &other.storage {
            Storage::Sparse(tokens) => {
                for &tok in tokens {
                    let (i, k) = math::hash_to_register_k(math::token_to_hash(tok), self.p);
                    let before = self.read_dense_register(i);
                    let after = math::apply_insert(before, k, D);
                    if after != before {
                        self.write_dense_register(i, after);
                    }
                }
            }
            Storage::Dense(_) => {
                for i in 0..self.num_registers() {
                    let current = self.read_dense_register(i);
                    let combined =
                        math::merge_register(current, other.read_dense_register(i), D);
                    if combined != current {
                        self.write_dense_register(i, combined);
                    }
                }
            }
        }
        self.martingale_invalid = true;
        self.martingale = f64::NAN;
        self.mu = f64::NAN;
        Ok(())
    }

    /// Merge every sketch yielded by `sketches` into `self`, in order
    /// (Algorithm 5 applied repeatedly). Same as calling [`Self::merge`]
    /// for each item, e.g. `sketch.merge_iter(&rollups)?` when rolling up
    /// many tenant sketches.
    ///
    /// # Errors
    ///
    /// Stops at the first [`MergeError`] and returns it; sketches merged
    /// before the failure remain merged.
    pub fn merge_iter<'a, I>(&mut self, sketches: I) -> Result<(), MergeError>
    where
        I: IntoIterator<Item = &'a Self>,
    {
        sketches.into_iter().try_for_each(|s| self.merge(s))
    }

    /// Reduce this sketch's precision to `new_p ≤ self.precision()`,
    /// returning a new sketch following Algorithm 6 of the paper
    /// (restricted to keeping `d` constant). The reduced sketch's ML
    /// estimate matches a directly-built sketch at `new_p` to within
    /// the property-test tolerance (~10% rel-diff at `n = 50_000`).
    /// An exact byte-for-byte parity test against Java's `downsize` is
    /// open in the issue tracker.
    ///
    /// A sparse sketch is reduced by replaying its tokens into a fresh
    /// sketch at `new_p`. A dense sketch always yields a dense result
    /// whose martingale estimator is invalid.
    ///
    /// # Panics
    ///
    /// Panics if `new_p` is outside `[MIN_P, self.precision()]`.
    pub fn reduce(&self, new_p: u32) -> Self {
        assert!(
            (MIN_P..=MAX_P).contains(&new_p) && new_p <= self.p,
            "new_p={new_p} must be in [{MIN_P}, {self_p}]",
            self_p = self.p
        );
        // For sparse mode, reducing p is equivalent to inserting the same
        // tokens into a fresh sketch at the smaller p — tokens are
        // independent of p (they're built from V=26 hash bits).
        if let Storage::Sparse(tokens) = &self.storage {
            let mut out = Self::new(new_p);
            for &tok in tokens {
                out.add_hash(math::token_to_hash(tok));
            }
            return out;
        }
        let mut out = Self::new_dense(new_p);
        if new_p == self.p {
            // Same precision: plain register-by-register copy.
            for i in 0..self.num_registers() {
                out.write_dense_register(i, self.read_dense_register(i));
            }
            out.martingale_invalid = true;
            return out;
        }
        let p_diff = self.p - new_p;
        let m_new = 1usize << new_p;
        let two_t = 1u32 << T;
        // Threshold on the register's high part `u`; only registers with
        // `u ≥ a` are rewritten before merging.
        let a = (64 - T - self.p) * two_t + 1;

        for new_i in 0..m_new {
            let mut acc = 0u32;
            // Old registers new_i, new_i + m_new, new_i + 2·m_new, … all
            // collapse into new register `new_i`.
            for j in 0..(1u64 << p_diff) {
                let old_i = new_i + m_new * j as usize;
                let mut r = self.read_dense_register(old_i);
                let u = r >> D;
                if u >= a {
                    let bit_len_j = if j == 0 { 0 } else { 64 - j.leading_zeros() };
                    // j < 2^p_diff, so bit_len_j ≤ p_diff and this cannot
                    // underflow.
                    let s = (p_diff - bit_len_j) * two_t;
                    if s > 0 {
                        let v = D + a - u;
                        if v > 0 {
                            let high = (r >> v) << v;
                            let low_v = r & ((1u32 << v) - 1);
                            // `s` reaches 32 or more once
                            // `p_diff - bit_len_j ≥ 8`. A plain `>>` would
                            // then panic in debug builds and shift by
                            // `s mod 32` in release builds; the intended
                            // result of ⌊low_v / 2^s⌋ is simply 0.
                            let low_v_shifted = low_v.checked_shr(s).unwrap_or(0);
                            r = high | low_v_shifted;
                        }
                        r += s << D;
                    }
                }
                acc = math::merge_register(acc, r, D);
            }
            out.write_dense_register(new_i, acc & REGISTER_MASK);
        }
        out.martingale_invalid = true;
        out
    }

    /// Whether nothing has been inserted yet. Sparse mode checks for an
    /// empty token list; dense mode scans the register bytes and reports
    /// empty only if all of them are zero.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        match &self.storage {
            Storage::Dense(registers) => !registers.iter().any(|&byte| byte != 0),
            Storage::Sparse(tokens) => tokens.is_empty(),
        }
    }

    /// Reset to empty (sparse).
    pub fn clear(&mut self) {
        self.storage = Storage::Sparse(Vec::new());
        self.martingale = 0.0;
        self.mu = 1.0;
        self.martingale_invalid = true;
    }

    /// Serialize. Layout: 4-byte magic, 1-byte format version (top bit
    /// set in sparse mode), 1-byte t, 1-byte d, 1-byte p, then either:
    ///
    /// - dense: raw packed register bytes (`m · 7/2`);
    /// - sparse: 4-byte token count followed by `count` little-endian
    ///   `u32` tokens.
    pub fn to_bytes(&self) -> Vec<u8> {
        match &self.storage {
            Storage::Dense(s) => {
                let mut out = Vec::with_capacity(HEADER_LEN + s.len());
                out.extend_from_slice(&MAGIC);
                out.push(FORMAT_VERSION);
                out.push(T as u8);
                out.push(D as u8);
                out.push(self.p as u8);
                out.extend_from_slice(s);
                out
            }
            Storage::Sparse(tokens) => {
                let mut out = Vec::with_capacity(HEADER_LEN + 4 + tokens.len() * 4);
                out.extend_from_slice(&MAGIC);
                out.push(FORMAT_VERSION | FORMAT_FLAG_SPARSE);
                out.push(T as u8);
                out.push(D as u8);
                out.push(self.p as u8);
                out.extend_from_slice(&(tokens.len() as u32).to_le_bytes());
                for &tok in tokens {
                    out.extend_from_slice(&tok.to_le_bytes());
                }
                out
            }
        }
    }

    /// Deserialize from a byte slice produced by [`Self::to_bytes`].
    pub fn from_bytes(bytes: &[u8]) -> Result<Self, DeserializeError> {
        if bytes.len() < HEADER_LEN {
            return Err(DeserializeError::TooShort {
                got: bytes.len(),
                need: HEADER_LEN,
            });
        }
        if bytes[0..4] != MAGIC {
            return Err(DeserializeError::BadMagic);
        }
        let raw_version = bytes[4];
        let is_sparse = raw_version & FORMAT_FLAG_SPARSE != 0;
        let version = raw_version & !FORMAT_FLAG_SPARSE;
        if version != FORMAT_VERSION {
            return Err(DeserializeError::UnsupportedVersion(raw_version));
        }
        let t = bytes[5];
        let d = bytes[6];
        if u32::from(t) != T || u32::from(d) != D {
            return Err(DeserializeError::ParameterMismatch { t, d });
        }
        let p = bytes[7];
        if !(MIN_P..=MAX_P).contains(&u32::from(p)) {
            return Err(DeserializeError::InvalidPrecision(p));
        }

        if is_sparse {
            if bytes.len() < HEADER_LEN + 4 {
                return Err(DeserializeError::TooShort {
                    got: bytes.len(),
                    need: HEADER_LEN + 4,
                });
            }
            let count =
                u32::from_le_bytes(bytes[HEADER_LEN..HEADER_LEN + 4].try_into().unwrap()) as usize;
            let expected = HEADER_LEN + 4 + count * 4;
            if bytes.len() != expected {
                return Err(DeserializeError::LengthMismatch {
                    got: bytes.len(),
                    expected,
                });
            }
            let mut tokens = Vec::with_capacity(count);
            for i in 0..count {
                let off = HEADER_LEN + 4 + i * 4;
                tokens.push(u32::from_le_bytes(bytes[off..off + 4].try_into().unwrap()));
            }
            // The wire format stores tokens as written; we re-sort and
            // dedupe defensively in case an older or hand-rolled writer
            // produced a non-sorted, duplicate-bearing list.
            tokens.sort_unstable();
            tokens.dedup();
            return Ok(Self {
                p: u32::from(p),
                storage: Storage::Sparse(tokens),
                martingale: f64::NAN,
                mu: f64::NAN,
                martingale_invalid: true,
            });
        }

        let m = 1usize << p;
        let expected_len = HEADER_LEN + (m / 2) * BYTES_PER_PAIR;
        if bytes.len() != expected_len {
            return Err(DeserializeError::LengthMismatch {
                got: bytes.len(),
                expected: expected_len,
            });
        }
        let storage: Box<[u8]> = bytes[HEADER_LEN..].to_vec().into_boxed_slice();
        Ok(Self {
            p: u32::from(p),
            storage: Storage::Dense(storage),
            martingale: f64::NAN,
            mu: f64::NAN,
            martingale_invalid: true,
        })
    }
}

/// Two sketches compare equal iff they have the same precision and
/// produce the same dense register state. Both sides are densified
/// internally for the comparison, so a sparse sketch and a dense
/// sketch built from the same inputs compare equal.
impl PartialEq for ExaLogLog {
    fn eq(&self, other: &Self) -> bool {
        if self.p != other.p {
            return false;
        }
        let m = 1usize << self.p;
        (0..m).all(|i| self.get_register(i) == other.get_register(i))
    }
}

impl Eq for ExaLogLog {}

/// `extend(iter_of_u64_hashes)` — convenience for streaming pre-hashed
/// values. The iterator is first collected into a buffer and handed to
/// `add_hashes` in one call, so a sparse sketch pays for a single bulk
/// sort+dedup pass instead of one sorted insert per element.
impl Extend<u64> for ExaLogLog {
    fn extend<I: IntoIterator<Item = u64>>(&mut self, iter: I) {
        let mut batch: Vec<u64> = Vec::new();
        batch.extend(iter);
        self.add_hashes(batch.as_slice());
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// SplitMix64 step used by the tests as a cheap, deterministic
    /// stand-in for a 64-bit hash: add the increment, then run the
    /// xor-shift / multiply mixing rounds. All arithmetic wraps.
    fn splitmix64(mut x: u64) -> u64 {
        const INCREMENT: u64 = 0x9E37_79B9_7F4A_7C15;
        const MULTIPLIER_1: u64 = 0xBF58_476D_1CE4_E5B9;
        const MULTIPLIER_2: u64 = 0x94D0_49BB_1331_11EB;

        x = x.wrapping_add(INCREMENT);
        x ^= x >> 30;
        x = x.wrapping_mul(MULTIPLIER_1);
        x ^= x >> 27;
        x = x.wrapping_mul(MULTIPLIER_2);
        x ^ (x >> 31)
    }

    /// Writes a distinct, index-derived value into every dense register
    /// and then reads each one back, checking the value survives the
    /// write/read round trip.
    #[test]
    fn pack_unpack_roundtrip_for_each_register() {
        // Deterministic per-index payload, masked down to the register width.
        let payload = |idx: usize| -> u32 {
            (0xA5A5A5u32.wrapping_mul(idx as u32 + 1) ^ 0x0DEADBu32) & REGISTER_MASK
        };

        let p = 6;
        let register_count = 1usize << p;
        let mut sketch = ExaLogLog::new_dense(p);
        for idx in 0..register_count {
            sketch.write_dense_register(idx, payload(idx));
        }
        // Verify only after all writes are done, so a write that clobbered
        // an earlier register would be caught here.
        for idx in 0..register_count {
            assert_eq!(sketch.get_register(idx), payload(idx));
        }
    }

    /// Saturates every register, zeroes the even-indexed ones, and checks
    /// that the odd-indexed registers still hold the saturated value.
    #[test]
    fn writing_register_does_not_disturb_neighbors() {
        let mut sketch = ExaLogLog::new_dense(8);
        let total = sketch.num_registers();

        for i in 0..total {
            sketch.write_dense_register(i, REGISTER_MASK);
        }
        // Clear only the even-indexed registers; their odd neighbours
        // must keep the all-ones value written above.
        for i in (0..total).filter(|i| i % 2 == 0) {
            sketch.write_dense_register(i, 0);
        }
        for i in 0..total {
            let expected = match i % 2 {
                0 => 0,
                _ => REGISTER_MASK,
            };
            assert_eq!(sketch.get_register(i), expected, "register {i}");
        }
    }

    /// A sketch fresh out of `new(12)` reports an estimate of exactly
    /// zero and is in sparse mode.
    #[test]
    fn empty_sketch_estimates_zero() {
        let fresh = ExaLogLog::new(12);
        let estimate = fresh.estimate();
        assert_eq!(estimate, 0.0);
        assert!(fresh.is_sparse());
    }

    /// For small `n` the sketch should remain sparse and its ML estimate
    /// should track the true distinct count.
    #[test]
    fn sparse_mode_is_exact_for_small_n() {
        // Each n is expected to stay below the break-even point for
        // p = 12; the `is_sparse` assertion below enforces that.
        for n in [1u64, 5, 10, 50, 100, 500] {
            let mut sketch = ExaLogLog::new(12);
            for h in (0..n).map(splitmix64) {
                sketch.add_hash(h);
            }
            assert!(sketch.is_sparse(), "expected sparse mode at n={n}");

            let est = sketch.estimate_ml();
            let truth = n as f64;
            // The sparse estimate is intended to be essentially the count
            // itself, but the bound actually enforced here is a loose 20%.
            let rel_err = (est - truth).abs() / truth;
            assert!(
                rel_err < 0.20,
                "sparse n={n} estimate={est}, rel_err={rel_err}"
            );
        }
    }

    /// Inserting 100 more hashes than `sparse_capacity(p)` must leave the
    /// sketch in dense mode.
    #[test]
    fn auto_promotes_at_break_even() {
        let p = 8;
        let insert_count = sparse_capacity(p) as u64 + 100;
        let mut sketch = ExaLogLog::new(p);
        for h in (0..insert_count).map(splitmix64) {
            sketch.add_hash(h);
        }
        assert!(!sketch.is_sparse(), "should have promoted to dense");
    }

    /// After promotion to dense at p = 12, the ML estimate must be within
    /// 5% of the true count for 10k, 100k and 1M inserted hashes.
    #[test]
    fn ml_estimate_within_error_bounds_after_promotion() {
        let p = 12;
        for n in [10_000u64, 100_000, 1_000_000] {
            let mut sketch = ExaLogLog::new(p);
            for h in (0..n).map(splitmix64) {
                sketch.add_hash(h);
            }
            assert!(!sketch.is_sparse(), "n={n} should have promoted");

            let est = sketch.estimate_ml();
            let truth = n as f64;
            let rel_err = (est - truth).abs() / truth;
            assert!(rel_err < 0.05, "n={n}: est={est}, rel_err={rel_err}");
        }
    }

    /// Re-inserting the same hash 1000 times must leave the sketch sparse
    /// with exactly one token stored.
    #[test]
    fn idempotent_inserts_in_sparse_mode_dont_grow_storage() {
        let mut s = ExaLogLog::new(12);
        for _ in 0..1000 {
            s.add_hash(0xDEAD_BEEF_CAFE_BABE);
        }
        assert!(s.is_sparse());
        // Destructure with `let … else` so the length check below can never
        // be skipped: a non-sparse storage variant fails the test outright
        // instead of letting it pass vacuously.
        let Storage::Sparse(tokens) = &s.storage else {
            panic!("expected sparse storage after inserting a single distinct hash");
        };
        // Exactly one distinct token regardless of how many duplicates we fed in.
        assert_eq!(tokens.len(), 1);
    }

    /// A sketch that starts sparse and auto-promotes must end up with the
    /// same registers as one that was dense from the first insert.
    #[test]
    fn dense_directly_built_matches_sparse_then_dense() {
        let p = 8;
        // Three times (break-even + 50) inserts, well past the capacity
        // reported by `sparse_capacity`.
        let n = (sparse_capacity(p) as u64 + 50) * 3;

        let mut promoted = ExaLogLog::new(p);
        let mut dense_from_start = ExaLogLog::new_dense(p);
        for h in (0..n).map(splitmix64) {
            promoted.add_hash(h);
            dense_from_start.add_hash(h);
        }
        assert!(!promoted.is_sparse());

        let register_count = 1usize << p;
        for i in 0..register_count {
            let via_promotion = promoted.get_register(i);
            let via_dense = dense_from_start.get_register(i);
            assert_eq!(via_promotion, via_dense, "register {i}");
        }
    }

    /// Merging two small sketches with overlapping inputs must estimate
    /// the size of the union, not the sum of the two sizes.
    #[test]
    fn merge_sparse_with_sparse_unions_tokens() {
        let p = 12;
        let mut left = ExaLogLog::new(p);
        let mut right = ExaLogLog::new(p);
        // Inputs 0..50 and 30..80 overlap on 30..50; the union is 0..80.
        for h in (0..50u64).map(splitmix64) {
            left.add_hash(h);
        }
        for h in (30..80u64).map(splitmix64) {
            right.add_hash(h);
        }
        left.merge(&right).unwrap();

        let est = left.estimate_ml();
        let expected = 80.0;
        let rel_err = (est - expected).abs() / expected;
        assert!(rel_err < 0.05, "merged sparse estimate = {est}");
    }

    /// Merging a dense sketch into a small one must leave the receiver
    /// in dense mode.
    #[test]
    fn merge_sparse_with_dense_yields_dense() {
        let p = 8;

        let mut small = ExaLogLog::new(p);
        for h in (0..50u64).map(splitmix64) {
            small.add_hash(h);
        }

        // Push the second sketch past `sparse_capacity` so it has really
        // promoted; the +1_000_000 offset keeps its inputs distinct from
        // the ones fed to `small`.
        let mut large = ExaLogLog::new(p);
        let overflow = sparse_capacity(p) as u64 + 100;
        for i in 0..overflow {
            large.add_hash(splitmix64(i + 1_000_000));
        }
        assert!(!large.is_sparse());

        small.merge(&large).unwrap();
        assert!(!small.is_sparse());
    }

    /// `to_bytes` → `from_bytes` on a sparse sketch must yield a sparse
    /// sketch whose ML estimate matches the original to within 1e-6.
    #[test]
    fn serialize_roundtrip_sparse() {
        let mut original = ExaLogLog::new(12);
        for h in (0..50u64).map(splitmix64) {
            original.add_hash(h);
        }
        let est_before = original.estimate_ml();

        let restored = ExaLogLog::from_bytes(&original.to_bytes()).unwrap();
        assert!(restored.is_sparse());
        let drift = (restored.estimate_ml() - est_before).abs();
        assert!(drift < 1e-6);
    }

    /// `to_bytes` → `from_bytes` on a dense sketch must produce the
    /// expected byte length and a dense sketch with the same ML estimate.
    #[test]
    fn serialize_roundtrip_dense() {
        let p = 12;
        let mut original = ExaLogLog::new_dense(p);
        for h in (0..50_000u64).map(splitmix64) {
            original.add_hash(h);
        }
        let est_before = original.estimate_ml();
        let encoded = original.to_bytes();

        // Literal wire size: 8 leading bytes plus 7 bytes per register
        // pair. Deliberately spelled with numbers rather than the crate's
        // constants so the test pins the on-wire length.
        let pair_count = (1usize << p) / 2;
        let expected_len = 8 + pair_count * 7;
        assert_eq!(encoded.len(), expected_len);

        let restored = ExaLogLog::from_bytes(&encoded).unwrap();
        assert!(!restored.is_sparse());
        let drift = (restored.estimate_ml() - est_before).abs();
        assert!(drift < 1e-6);
    }

    /// Hash → token → hash must preserve the `(register index, k)` pair
    /// for every tested precision up to 24. Sparse mode relies on this
    /// being lossless.
    #[test]
    fn token_round_trip_preserves_registers_for_p_le_v_minus_t() {
        for p in [3u32, 8, 12, 18, 24] {
            for i in 0..1000u64 {
                let original = splitmix64(i);
                // Squeeze the hash through the 32-bit token and back.
                let rebuilt = math::token_to_hash(math::hash_to_token(original));

                let (i_orig, k_orig) = math::hash_to_register_k(original, p);
                let (i_recon, k_recon) = math::hash_to_register_k(rebuilt, p);
                assert_eq!(i_orig, i_recon, "p={p}, i={i}: index mismatch");
                assert_eq!(k_orig, k_recon, "p={p}, i={i}: k mismatch");
            }
        }
    }

    /// Reducing a sketch to its own precision must not change any
    /// register.
    #[test]
    fn reduce_to_same_p_returns_same_state() {
        let p = 10;
        let mut source = ExaLogLog::new_dense(p);
        for h in (0..10_000u64).map(splitmix64) {
            source.add_hash(h);
        }

        let reduced = source.reduce(p);
        for idx in 0..source.num_registers() {
            assert_eq!(reduced.get_register(idx), source.get_register(idx));
        }
    }

    /// A p = 12 sketch reduced to p = 10 must estimate close to a sketch
    /// built directly at p = 10 from the same hashes.
    #[test]
    fn reduce_preserves_estimate_within_tolerance() {
        let (p_high, p_low) = (12, 10);
        let n = 50_000u64;

        let mut fine = ExaLogLog::new_dense(p_high);
        let mut direct = ExaLogLog::new_dense(p_low);
        for h in (0..n).map(splitmix64) {
            fine.add_hash(h);
            direct.add_hash(h);
        }

        let red_est = fine.reduce(p_low).estimate_ml();
        let dir_est = direct.estimate_ml();
        // The gap is normalised by the true count n, not by either estimate.
        let rel_diff = (red_est - dir_est).abs() / n as f64;
        assert!(rel_diff < 0.10, "reduced={red_est}, direct={dir_est}");
    }

    /// Bulk `add_hashes` must produce the same registers and the same ML
    /// estimate as feeding the hashes one by one through `add_hash`.
    #[test]
    fn add_hashes_matches_individual_inserts() {
        // The input does not depend on p, so build it once.
        let hashes: Vec<u64> = (0..50_000u64).map(splitmix64).collect();

        for p in [8u32, 12] {
            let mut serial = ExaLogLog::new(p);
            for h in hashes.iter().copied() {
                serial.add_hash(h);
            }
            let mut batched = ExaLogLog::new(p);
            batched.add_hashes(&hashes);

            for i in 0..serial.num_registers() {
                assert_eq!(
                    serial.get_register(i),
                    batched.get_register(i),
                    "p={p} register {i} differs"
                );
            }
            let gap = (serial.estimate_ml() - batched.estimate_ml()).abs();
            assert!(gap < 1e-6);
        }
    }

    /// Tighter RMSE targets must select strictly larger precisions, and
    /// the 2% target must select at least p = 8.
    #[test]
    fn with_target_rmse_picks_appropriate_p() {
        // RMSE_target = 2%: m_needed = 3.67 / (28 * 0.02²) = 327.7, and
        // 2^8 = 256 < 327.7 <= 512 = 2^9.
        // NOTE(review): that arithmetic suggests p >= 9, yet the assertion
        // below only requires p >= 8 — confirm which bound is intended.
        let p_at_2pct = ExaLogLog::with_target_rmse(0.02).precision();
        assert!(p_at_2pct >= 8);

        let p_at_1pct = ExaLogLog::with_target_rmse(0.01).precision();
        assert!(p_at_1pct > p_at_2pct);

        let p_at_half_pct = ExaLogLog::with_target_rmse(0.005).precision();
        assert!(p_at_half_pct > p_at_1pct);
    }

    /// `add_hashes_sorted` must leave a dense sketch in the same register
    /// state as inserting the same hashes one at a time.
    #[test]
    fn add_hashes_sorted_matches_serial() {
        let p = 12;
        // NOTE(review): this slice is in generation order, not sorted —
        // presumably `add_hashes_sorted` does any ordering itself; confirm
        // against its documentation.
        let hashes: Vec<u64> = (0..100_000u64).map(splitmix64).collect();

        let mut serial = ExaLogLog::new_dense(p);
        for h in hashes.iter().copied() {
            serial.add_hash(h);
        }
        let mut sorted = ExaLogLog::new_dense(p);
        sorted.add_hashes_sorted(&hashes);

        for i in 0..serial.num_registers() {
            let one_by_one = serial.get_register(i);
            let bulk = sorted.get_register(i);
            assert_eq!(one_by_one, bulk, "register {i}");
        }
    }

    /// `merge_iter` over several sketches must give the same registers as
    /// calling `merge` once per sketch.
    #[test]
    fn merge_iter_matches_repeated_merge() {
        let p = 10;
        // Five dense sketches, each fed its own 1000-value input range.
        let mut shards: Vec<ExaLogLog> = (0..5)
            .map(|tid| {
                let mut shard = ExaLogLog::new_dense(p);
                let start = tid * 1000;
                for h in (start..start + 1000).map(splitmix64) {
                    shard.add_hash(h);
                }
                shard
            })
            .collect();
        // The first shard is the merge target; the rest are merged in.
        let first = shards.remove(0);

        // Path 1: one `merge` call per remaining shard.
        let mut one_by_one = first.clone();
        for shard in &shards {
            one_by_one.merge(shard).unwrap();
        }
        // Path 2: a single `merge_iter` call over the same shards.
        let mut bulk = first;
        bulk.merge_iter(shards.iter()).unwrap();

        for idx in 0..one_by_one.num_registers() {
            assert_eq!(one_by_one.get_register(idx), bulk.get_register(idx));
        }
    }

    /// Feeding the same 50 hashes three times through `add_hashes` must
    /// keep the sketch sparse and estimate roughly 50.
    #[test]
    fn add_hashes_in_sparse_mode_dedupes() {
        // Inputs 0..50 repeated three times back to back.
        let hashes: Vec<u64> = (0..3).flat_map(|_| 0..50u64).map(splitmix64).collect();

        let mut sketch = ExaLogLog::new(12);
        sketch.add_hashes(&hashes);
        assert!(sketch.is_sparse());

        let est = sketch.estimate_ml();
        let expected = 50.0;
        let rel_err = (est - expected).abs() / expected;
        assert!(rel_err < 0.20, "expected ~50, got {est}");
    }

    /// A dense sketch at `MIN_P` must still produce a finite ML estimate
    /// after 1000 inserts.
    #[test]
    fn min_p_works() {
        let mut smallest = ExaLogLog::new_dense(MIN_P);
        for h in (0..1000u64).map(splitmix64) {
            smallest.add_hash(h);
        }
        let est = smallest.estimate_ml();
        assert!(est.is_finite());
    }

    /// `new` with p = 25 or p = `MAX_P` must return a sketch that is not
    /// in sparse mode.
    #[test]
    fn max_p_skips_sparse() {
        // p > MAX_P_SPARSE = 24 → start dense, because the 32-bit
        // sparse-token format is lossy past p + t = V.
        let just_above = ExaLogLog::new(25);
        assert!(!just_above.is_sparse(), "p=25 should skip sparse mode");

        let at_max = ExaLogLog::new(MAX_P);
        assert!(!at_max.is_sparse(), "p=MAX_P should skip sparse mode");
    }

    /// Boundary check: a sketch created at exactly `MAX_P_SPARSE` still
    /// starts in sparse mode.
    #[test]
    fn p_24_still_uses_sparse() {
        // p = 24 plus t = 2 gives V = 26, which is still safe.
        let at_boundary = ExaLogLog::new(MAX_P_SPARSE);
        assert!(
            at_boundary.is_sparse(),
            "p={MAX_P_SPARSE} should still be sparse"
        );
    }

    /// `is_empty` is true for a fresh sketch from either constructor.
    #[test]
    fn empty_sketch_is_empty() {
        let via_new = ExaLogLog::new(12);
        assert!(via_new.is_empty());

        let via_new_dense = ExaLogLog::new_dense(12);
        assert!(via_new_dense.is_empty());
    }

    /// After one insert `is_empty` is false, and it stays false once the
    /// sketch has been explicitly densified.
    #[test]
    fn populated_sketch_is_not_empty() {
        let mut sketch = ExaLogLog::new(12);
        sketch.add_hash(0xDEAD_BEEF);
        assert!(!sketch.is_empty());

        // Same check again after `densify()`.
        sketch.densify();
        assert!(!sketch.is_empty());
    }

    /// `Extend<u64>::extend` must leave the sketch equal (by `==`) to one
    /// filled through `add_hashes` with the same values.
    #[test]
    fn extend_matches_add_hashes() {
        let p = 12;
        let hashes: Vec<u64> = (0..50_000u64).map(splitmix64).collect();

        let mut via_add_hashes = ExaLogLog::new_dense(p);
        via_add_hashes.add_hashes(&hashes);

        let mut via_extend = ExaLogLog::new_dense(p);
        via_extend.extend(hashes.iter().copied());

        assert_eq!(via_add_hashes, via_extend);
    }

    /// `==` must ignore the storage mode: a sparse and a dense sketch fed
    /// the same hashes compare equal.
    #[test]
    fn equality_matches_register_state() {
        let p = 10;
        let mut sparse_side = ExaLogLog::new(p);
        let mut dense_side = ExaLogLog::new_dense(p);
        for h in (0..100u64).map(splitmix64) {
            sparse_side.add_hash(h);
            dense_side.add_hash(h);
        }

        // Confirm the two really are in different storage modes before
        // comparing them, otherwise the test proves nothing.
        assert!(sparse_side.is_sparse());
        assert!(!dense_side.is_sparse());
        assert_eq!(sparse_side, dense_side);
    }

    /// Two empty sketches with different precisions are not equal.
    #[test]
    fn different_p_compare_unequal() {
        let at_p10 = ExaLogLog::new(10);
        let at_p11 = ExaLogLog::new(11);
        assert_ne!(at_p10, at_p11);
    }

    /// Compile-time check that `ExaLogLog` is both `Send` and `Sync`.
    #[test]
    fn is_send_and_sync() {
        // Nothing runs here: the call below type-checks only if the
        // bounds on `T` hold for `ExaLogLog`.
        fn require_send_and_sync<T: Send + Sync>() {}
        require_send_and_sync::<ExaLogLog>();
    }

    /// Pins the dense register footprint at 3.5 bytes per register
    /// (`m * 7 / 2`) for several precisions. The comparison against HLL
    /// in the test name is not computed here; only the byte count is
    /// checked.
    #[test]
    fn memory_is_43_percent_smaller_than_hll_6bit() {
        for p in [8u32, 10, 12, 14] {
            let register_count = 1usize << p;
            let expected_bytes = register_count * 7 / 2;
            let sketch = ExaLogLog::new_dense(p);
            assert_eq!(sketch.register_bytes(), expected_bytes);
        }
    }
}