exaloglog 0.15.0

ExaLogLog: space-efficient approximate distinct counting (Ertl 2024). 43% smaller than HyperLogLog with the same estimation error.
Documentation
//! ExaLogLog: space-efficient approximate distinct counting.
//!
//! Implementation of the ExaLogLog (ELL) algorithm by Otmar Ertl,
//! "ExaLogLog: Space-Efficient and Practical Approximate Distinct
//! Counting up to the Exa-Scale", arXiv:2402.13726 / EDBT 2025.
//!
//! Compared to HyperLogLog with 6-bit registers, ExaLogLog reaches the
//! same estimation error with **~43% less in-memory state** (the
//! [`ExaLogLog`] packed `d = 20` variant, MVP = 3.67) or **~41% less**
//! with 32-bit aligned registers ([`ExaLogLogFast`], `d = 24`,
//! MVP = 3.78) that are easier to access concurrently.
//!
//! Both variants:
//!
//! - support distinct counts up to the exa-scale (10¹⁹+);
//! - are mergeable, idempotent, and reducible;
//! - have constant-time inserts independent of `p`;
//! - implement martingale (HIP) and maximum-likelihood estimators
//!   (ML solver: paper Algorithm 8 with bisection fallback);
//! - start in **sparse mode** (token list) and auto-promote to the
//!   dense register array at the per-variant break-even point;
//! - serialize through [`Self::to_bytes`] / [`Self::from_bytes`] with
//!   wire-format version checking;
//! - validate byte-for-byte against the Dynatrace Java reference
//!   ([dynatrace-research/exaloglog-paper](https://github.com/dynatrace-research/exaloglog-paper))
//!   in `tests/java_parity.rs`.
//!
//! # Choosing a variant
//!
//! - Use [`ExaLogLog`] (the packed default) for the smallest memory
//!   footprint at a given accuracy. Single-threaded ingest only.
//! - Use [`ExaLogLogFast`] when you need lock-free concurrent ingest
//!   via [`ExaLogLogFast::add_hash_atomic`], or when you prefer
//!   `u32`-aligned register access.
//!
//! # Example
//!
//! ```
//! use exaloglog::ExaLogLog;
//!
//! let mut sketch = ExaLogLog::new(12);
//! for i in 0..100_000u64 {
//!     sketch.add(&i);
//! }
//! let estimate = sketch.estimate();
//! assert!((estimate - 100_000.0).abs() / 100_000.0 < 0.05);
//! ```
//!
//! # Optional features
//!
//! - `serde` — `Serialize` / `Deserialize` impls going through the
//!   wire format. Works with bincode, JSON, MessagePack, CBOR, etc.
//! - `rayon` — parallel rollups via [`merge_many_par`] /
//!   [`merge_many_par_fast`].
//! - `simd` — x86_64 SIMD batch path (AVX2 + BMI1 + LZCNT, with an
//!   AVX-512 + CD upgrade when both are detected). Gated by runtime
//!   feature detection. The `simd` feature effectively raises the
//!   crate's MSRV to 1.89 (when AVX-512 intrinsics stabilized).

#![warn(missing_docs)]
#![deny(unsafe_code)]

mod fast;
mod math;
mod packed;
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
#[allow(unsafe_code)]
mod simd_x86;
#[cfg(feature = "rayon")]
mod rayon_impl;
#[cfg(feature = "serde")]
mod serde_impl;

#[cfg(feature = "rayon")]
pub use rayon_impl::{merge_many_par, merge_many_par_fast};

pub use fast::ExaLogLogFast;
pub use packed::ExaLogLog;

/// `t` parameter of the approximated update value distribution. Fixed at 2
/// in this build — both [`ExaLogLog`] (`d=20`) and [`ExaLogLogFast`] (`d=24`)
/// use it, and it's the configuration the paper recommends in practice.
pub const T: u32 = 2;

/// Minimum supported precision parameter `p`.
pub const MIN_P: u32 = 3;
/// Maximum supported precision parameter `p`.
pub const MAX_P: u32 = 26;

/// Magic prefix used by the serializers. ASCII `"ELL\0"`.
pub(crate) const MAGIC: [u8; 4] = *b"ELL\0";
/// Format version. Bumped on incompatible layout changes.
pub(crate) const FORMAT_VERSION: u8 = 1;

/// Error returned when two sketches cannot be merged.
#[derive(Debug, PartialEq, Eq)]
pub enum MergeError {
    /// The two sketches were created with different `p` values.
    PrecisionMismatch {
        /// Precision of `self`.
        lhs: u32,
        /// Precision of `other`.
        rhs: u32,
    },
}

impl std::fmt::Display for MergeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            MergeError::PrecisionMismatch { lhs, rhs } => {
                write!(f, "precision mismatch: lhs p={lhs}, rhs p={rhs}")
            }
        }
    }
}

impl std::error::Error for MergeError {}

/// Error returned when deserializing a byte slice into an ExaLogLog sketch.
#[derive(Debug, PartialEq, Eq)]
pub enum DeserializeError {
    /// Byte slice is shorter than the minimum header length.
    TooShort {
        /// Bytes received.
        got: usize,
        /// Bytes required (at minimum).
        need: usize,
    },
    /// Magic prefix did not match.
    BadMagic,
    /// Format version is not supported by this build.
    UnsupportedVersion(u8),
    /// The encoded `t` or `d` parameter does not match what this type expects.
    /// `ExaLogLog` accepts `(t=2, d=20)`; `ExaLogLogFast` accepts `(t=2, d=24)`.
    ParameterMismatch {
        /// Encoded `t`.
        t: u8,
        /// Encoded `d`.
        d: u8,
    },
    /// Encoded `p` is outside the supported range.
    InvalidPrecision(u8),
    /// Length of the byte slice does not match what `p` implies.
    LengthMismatch {
        /// Bytes received.
        got: usize,
        /// Bytes expected, given the encoded `p`.
        expected: usize,
    },
}

impl std::fmt::Display for DeserializeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            DeserializeError::TooShort { got, need } => {
                write!(f, "byte slice too short: got {got}, need at least {need}")
            }
            DeserializeError::BadMagic => write!(f, "bad magic prefix"),
            DeserializeError::UnsupportedVersion(v) => {
                write!(f, "unsupported format version: {v}")
            }
            DeserializeError::ParameterMismatch { t, d } => {
                write!(f, "parameter mismatch: encoded t={t}, d={d}")
            }
            DeserializeError::InvalidPrecision(p) => {
                write!(f, "invalid precision p={p} (allowed: {MIN_P}..={MAX_P})")
            }
            DeserializeError::LengthMismatch { got, expected } => {
                write!(f, "length mismatch: got {got} bytes, expected {expected}")
            }
        }
    }
}

impl std::error::Error for DeserializeError {}