Skip to main content

card_est_array/impls/
hyper_log_log8.rs

1/*
2 * SPDX-FileCopyrightText: 2025 Sebastiano Vigna
3 *
4 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
5 */
6
7use super::DefaultEstimator;
8use super::hyper_log_log::apply_correction;
9use crate::traits::{
10    EstimationLogic, MergeEstimationLogic, SliceEstimationLogic, assert_backend_len,
11};
12use std::borrow::Borrow;
13use std::hash::*;
14use std::{fmt, marker::PhantomData};
15
16/// Estimation logic implementing the HyperLogLog algorithm with byte-sized
17/// registers.
18///
19/// This implementation uses a full byte for each register instead of packed 5–
20/// or 6-bit registers. This approach trades 60% (for 5-bit registers) or 33.3%
21/// (for 6-bit registers) extra space with respect to
22/// [`HyperLogLog`](super::HyperLogLog) for:
23///
24/// - fast register access (byte indexing instead of bit-field extraction);
25/// - no backend alignment constraints;
26/// - SIMD-accelerated merge using platform-specific byte-wise maximum
27///   instructions (SSE2 on `x86`/`x86_64`, NEON on `aarch64`);
28/// - no temporary allocations for merge operations.
29///
30/// The choice between the two logics should be guided by the specific use case
31/// and constraints of your application. Please try the included benchmarks to
32/// have an idea of the difference in performance between the two logics in your
33/// environment.
34///
35/// Instances are created through [`HyperLogLog8Builder`]:
36///
37/// ```
38/// # use card_est_array::impls::HyperLogLog8Builder;
39/// // Default: LogLog-β correction enabled
40/// let logic = HyperLogLog8Builder::new()
41///     .log2_num_regs(8)
42///     .build::<String>();
43///
44/// // Disable LogLog-β, use classic HLL + linear-counting fallback
45/// let logic = HyperLogLog8Builder::new()
46///     .log2_num_regs(8)
47///     .beta::<false>()
48///     .build::<String>();
49/// ```
50///
51/// # Type parameters
52///
53/// - `T`: the type of elements to count (must implement [`Hash`]).
54///
55/// - `H`: the [`BuildHasher`] used to hash elements.
56///
57/// - `BETA`: when `true` (the default), the
58///   [LogLog-β](super::hyper_log_log::beta_horner) bias correction is used
59///   during estimation. See [`HyperLogLog`](super::HyperLogLog) for details.
60#[derive(Debug, PartialEq)]
61pub struct HyperLogLog8<T, H, const BETA: bool = true> {
62    build_hasher: H,
63    num_regs_minus_1: u64,
64    log2_num_regs: u32,
65    num_regs: usize,
66    alpha_m_m: f64,
67    _marker: PhantomData<T>,
68}
69
70// We implement Clone manually because we do not want to require that T is
71// Clone.
72impl<T, H: Clone, const BETA: bool> Clone for HyperLogLog8<T, H, BETA> {
73    fn clone(&self) -> Self {
74        Self {
75            build_hasher: self.build_hasher.clone(),
76            num_regs_minus_1: self.num_regs_minus_1,
77            log2_num_regs: self.log2_num_regs,
78            num_regs: self.num_regs,
79            alpha_m_m: self.alpha_m_m,
80            _marker: PhantomData,
81        }
82    }
83}
84
85impl<T, H: Clone, const BETA: bool> HyperLogLog8<T, H, BETA> {
86    /// Returns the base-2 logarithm of the number of registers per estimator.
87    pub fn log2_num_regs(&self) -> u32 {
88        self.log2_num_regs
89    }
90}
91
92impl<T: Hash, H: BuildHasher + Clone, const BETA: bool> SliceEstimationLogic<u8>
93    for HyperLogLog8<T, H, BETA>
94{
95    #[inline(always)]
96    fn backend_len(&self) -> usize {
97        self.num_regs
98    }
99}
100
101impl<T: Hash, H: BuildHasher + Clone, const BETA: bool> EstimationLogic
102    for HyperLogLog8<T, H, BETA>
103{
104    type Item = T;
105    type Backend = [u8];
106    type Estimator<'a>
107        = DefaultEstimator<Self, &'a Self, Box<[u8]>>
108    where
109        T: 'a,
110        H: 'a;
111
112    fn new_estimator(&self) -> Self::Estimator<'_> {
113        Self::Estimator::new(self, vec![0u8; self.num_regs].into_boxed_slice())
114    }
115
116    fn add(&self, backend: &mut Self::Backend, element: impl Borrow<T>) {
117        assert_backend_len!(self, backend);
118        let hash = self.build_hasher.hash_one(element.borrow());
119        let register = (hash & self.num_regs_minus_1) as usize;
120        let r = hash.rotate_right(self.log2_num_regs).trailing_zeros();
121
122        debug_assert!(register < self.num_regs);
123
124        backend[register] = backend[register].max(r as u8 + 1);
125    }
126
127    fn estimate(&self, backend: &[u8]) -> f64 {
128        assert_backend_len!(self, backend);
129        let mut harmonic_mean = 0.0;
130        let mut zeroes = 0usize;
131
132        for &value in backend {
133            if value == 0 {
134                zeroes += 1;
135            }
136            // 2⁻ᵛ via IEEE 754: exponent = 1023 − v, zero mantissa.
137            harmonic_mean += f64::from_bits((1023 - value as u64) << 52);
138        }
139
140        apply_correction::<BETA>(
141            harmonic_mean,
142            zeroes,
143            self.num_regs,
144            self.log2_num_regs,
145            self.alpha_m_m,
146        )
147    }
148
149    #[inline(always)]
150    fn clear(&self, backend: &mut [u8]) {
151        backend.fill(0);
152    }
153
154    #[inline(always)]
155    fn set(&self, dst: &mut [u8], src: &[u8]) {
156        debug_assert_eq!(dst.len(), src.len());
157        dst.copy_from_slice(src);
158    }
159}
160
161impl<T: Hash, H: BuildHasher + Clone, const BETA: bool> MergeEstimationLogic
162    for HyperLogLog8<T, H, BETA>
163{
164    type Helper = ();
165
166    fn new_helper(&self) -> Self::Helper {}
167
168    #[inline(always)]
169    fn merge_with_helper(&self, dst: &mut [u8], src: &[u8], _helper: &mut Self::Helper) {
170        debug_assert_eq!(dst.len(), src.len());
171        merge_max_bytes(dst, src);
172    }
173}
174
175impl<T, H, const BETA: bool> fmt::Display for HyperLogLog8<T, H, BETA> {
176    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
177        write!(
178            f,
179            "HyperLogLog8 with relative standard deviation: {}% ({} registers/estimator, 8 bits/register, {} bytes/estimator)",
180            100.0 * super::HyperLogLog::rel_std(self.log2_num_regs),
181            self.num_regs,
182            self.num_regs,
183        )
184    }
185}
186
187/// Builder for [`HyperLogLog8`] cardinality-estimation logic.
188///
189/// The builder lets you configure:
190/// - the number of registers, either directly
191///   ([`log2_num_regs`](Self::log2_num_regs)) or via a target relative
192///   standard deviation ([`rsd`](Self::rsd));
193/// - the hash function ([`build_hasher`](Self::build_hasher));
194/// - whether [LogLog-β bias correction](super::hyper_log_log::beta_horner) is
195///   enabled ([`beta`](Self::beta)).
196///
197/// Call [`build`](Self::build) to obtain the configured [`HyperLogLog8`]
198/// logic.
199#[derive(Debug, Clone)]
200pub struct HyperLogLog8Builder<H, const BETA: bool = true> {
201    build_hasher: H,
202    log2_num_regs: u32,
203}
204
205impl HyperLogLog8Builder<BuildHasherDefault<DefaultHasher>> {
206    /// Creates a new builder for a [`HyperLogLog8`] logic.
207    pub const fn new() -> Self {
208        Self {
209            build_hasher: BuildHasherDefault::new(),
210            log2_num_regs: 4,
211        }
212    }
213}
214
215impl Default for HyperLogLog8Builder<BuildHasherDefault<DefaultHasher>> {
216    fn default() -> Self {
217        Self::new()
218    }
219}
220
221impl<H, const BETA: bool> HyperLogLog8Builder<H, BETA> {
222    /// Sets the desired relative standard deviation.
223    ///
224    /// This is a high-level alternative to [`Self::log2_num_regs`]. Calling one
225    /// after the other invalidates the work done by the first one.
226    ///
227    /// # Arguments
228    /// * `rsd`: the relative standard deviation to be attained.
229    ///
230    /// # Panics
231    ///
232    /// If the resulting number of registers is less than 16 (i.e., `rsd` is
233    /// too large).
234    pub fn rsd(self, rsd: f64) -> Self {
235        self.log2_num_regs(super::HyperLogLog::log2_num_of_registers(rsd))
236    }
237
238    /// Sets the base-2 logarithm of the number of registers.
239    ///
240    /// This is a low-level alternative to [`Self::rsd`]. Calling one after the
241    /// other invalidates the work done by the first one.
242    ///
243    /// # Arguments
244    /// * `log2_num_regs`: the logarithm of the number of registers per
245    ///   estimator.
246    ///
247    /// # Panics
248    ///
249    /// If `log2_num_regs` is less than 4.
250    pub const fn log2_num_regs(mut self, log2_num_regs: u32) -> Self {
251        assert!(
252            log2_num_regs >= 4,
253            "the logarithm of the number of registers per estimator should be at least 4"
254        );
255        self.log2_num_regs = log2_num_regs;
256        self
257    }
258
259    /// Enables or disables the [LogLog-β bias
260    /// correction](super::hyper_log_log::beta_horner) in the estimate.
261    ///
262    /// See [`HyperLogLog8`] for details.
263    pub fn beta<const BETA2: bool>(self) -> HyperLogLog8Builder<H, BETA2> {
264        HyperLogLog8Builder {
265            build_hasher: self.build_hasher,
266            log2_num_regs: self.log2_num_regs,
267        }
268    }
269
270    /// Sets the [`BuildHasher`] to use.
271    ///
272    /// Using this method you can select a specific hasher based on one or more
273    /// seeds.
274    pub fn build_hasher<H2>(self, build_hasher: H2) -> HyperLogLog8Builder<H2, BETA> {
275        HyperLogLog8Builder {
276            log2_num_regs: self.log2_num_regs,
277            build_hasher,
278        }
279    }
280
281    /// Builds the logic.
282    ///
283    /// The type of objects the estimators keep track of is defined here by `T`,
284    /// but it is usually inferred by the compiler.
285    pub fn build<T>(self) -> HyperLogLog8<T, H, BETA> {
286        let log2_num_regs = self.log2_num_regs;
287        let num_regs = 1usize << log2_num_regs;
288        let alpha = match log2_num_regs {
289            4 => 0.673,
290            5 => 0.697,
291            6 => 0.709,
292            _ => 0.7213 / (1.0 + 1.079 / num_regs as f64),
293        };
294
295        HyperLogLog8 {
296            num_regs,
297            num_regs_minus_1: (num_regs - 1) as u64,
298            log2_num_regs,
299            alpha_m_m: alpha * (num_regs as f64).powi(2),
300            build_hasher: self.build_hasher,
301            _marker: PhantomData,
302        }
303    }
304}
305
306// ─── SIMD merge: byte-wise maximum ─────────────────────────────────────
307
308#[allow(dead_code)]
309fn merge_max_bytes_scalar(dst: &mut [u8], src: &[u8]) {
310    for (d, &s) in dst.iter_mut().zip(src.iter()) {
311        *d = (*d).max(s);
312    }
313}
314
315#[cfg(target_arch = "x86")]
316use std::arch::x86::*;
317#[cfg(target_arch = "x86_64")]
318use std::arch::x86_64::*;
319
320#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
321#[target_feature(enable = "sse2")]
322unsafe fn merge_max_bytes_sse2(dst: &mut [u8], src: &[u8]) {
323    debug_assert_eq!(dst.len(), src.len());
324    debug_assert!(dst.len() % 16 == 0);
325
326    let n = dst.len();
327    let dst_ptr = dst.as_mut_ptr();
328    let src_ptr = src.as_ptr();
329    let mut i = 0;
330    while i < n {
331        // SAFETY: i + 16 <= n because n is a multiple of 16 and i steps by 16.
332        unsafe {
333            let a = _mm_loadu_si128(dst_ptr.add(i) as *const __m128i);
334            let b = _mm_loadu_si128(src_ptr.add(i) as *const __m128i);
335            let max = _mm_max_epu8(a, b);
336            _mm_storeu_si128(dst_ptr.add(i) as *mut __m128i, max);
337        }
338        i += 16;
339    }
340}
341
342#[cfg(target_arch = "aarch64")]
343use std::arch::aarch64::*;
344
345#[cfg(target_arch = "aarch64")]
346#[target_feature(enable = "neon")]
347unsafe fn merge_max_bytes_neon(dst: &mut [u8], src: &[u8]) {
348    debug_assert_eq!(dst.len(), src.len());
349    debug_assert!(dst.len() % 16 == 0);
350
351    let n = dst.len();
352    let dst_ptr = dst.as_mut_ptr();
353    let src_ptr = src.as_ptr();
354    let mut i = 0;
355    while i < n {
356        // SAFETY: i + 16 <= n because n is a multiple of 16 and i steps by 16.
357        unsafe {
358            let a = vld1q_u8(dst_ptr.add(i));
359            let b = vld1q_u8(src_ptr.add(i));
360            let max = vmaxq_u8(a, b);
361            vst1q_u8(dst_ptr.add(i), max);
362        }
363        i += 16;
364    }
365}
366
367#[cfg(target_arch = "x86_64")]
368fn merge_max_bytes(dst: &mut [u8], src: &[u8]) {
369    // SAFETY: SSE2 is always available on x86_64.
370    unsafe { merge_max_bytes_sse2(dst, src) }
371}
372
373#[cfg(target_arch = "x86")]
374fn merge_max_bytes(dst: &mut [u8], src: &[u8]) {
375    if is_x86_feature_detected!("sse2") {
376        // SAFETY: we just verified SSE2 is available.
377        unsafe { merge_max_bytes_sse2(dst, src) }
378    } else {
379        merge_max_bytes_scalar(dst, src)
380    }
381}
382
383#[cfg(target_arch = "aarch64")]
384fn merge_max_bytes(dst: &mut [u8], src: &[u8]) {
385    // SAFETY: NEON is always available on aarch64.
386    unsafe { merge_max_bytes_neon(dst, src) }
387}
388
389#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
390fn merge_max_bytes(dst: &mut [u8], src: &[u8]) {
391    merge_max_bytes_scalar(dst, src)
392}