Skip to main content

card_est_array/impls/
hyper_log_log.rs

1/*
2 * SPDX-FileCopyrightText: 2024 Matteo Dell'Acqua
3 * SPDX-FileCopyrightText: 2025 Sebastiano Vigna
4 *
5 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
6 */
7
8use super::DefaultEstimator;
9use crate::traits::Word;
10use crate::traits::{
11    EstimationLogic, MergeEstimationLogic, SliceEstimationLogic, assert_backend_len,
12};
13use num_primitive::{PrimitiveNumber, PrimitiveNumberAs};
14use std::hash::*;
15use std::num::NonZeroUsize;
16use std::{borrow::Borrow, f64::consts::LN_2, fmt};
17
18/// Error returned by [`HyperLogLogBuilder::build`] when the configuration is
19/// invalid.
20#[derive(Debug, Clone)]
21pub enum HyperLogLogError {
22    /// The register size derived from `num_elements` exceeds the maximum
23    /// supported by the hash type.
24    RegisterSizeTooLarge {
25        register_size: usize,
26        num_elements: usize,
27        hash_bits: u32,
28    },
29    /// The estimator size in bits is not divisible by the bit width of the
30    /// backend word type `W`, so registers cannot be packed exactly into whole
31    /// words.
32    UnalignedBackend {
33        est_size_in_bits: usize,
34        word_bits: usize,
35        min_alignment: String,
36        min_log2_num_regs: u32,
37    },
38}
39
40impl fmt::Display for HyperLogLogError {
41    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
42        match self {
43            Self::RegisterSizeTooLarge {
44                register_size,
45                num_elements,
46                hash_bits,
47            } => write!(
48                f,
49                "register size {} (derived from num_elements = {}) exceeds the maximum of 6 \
50                supported with a {}-bit hash type; reduce num_elements or use a hash with more bits",
51                register_size, num_elements, hash_bits,
52            ),
53            Self::UnalignedBackend {
54                est_size_in_bits,
55                word_bits,
56                min_alignment,
57                min_log2_num_regs,
58            } => write!(
59                f,
60                "estimator size ({} bits) is not divisible by the word size ({} bits), \
61                so register backends cannot be aligned; use {} or a smaller unsigned integer type \
62                as the word type, or increase log2_num_regs (possibly by reducing \
63                the relative standard deviation) to at least {}",
64                est_size_in_bits, word_bits, min_alignment, min_log2_num_regs,
65            ),
66        }
67    }
68}
69
70impl std::error::Error for HyperLogLogError {}
71
72/// The type returned by the hash function.
73type HashResult = u64;
74
75/// LogLog-β polynomial coefficients for `log2_num_regs` in [4 . . 18].
76///
77/// Indexed as `LOGLOG_BETA[log2_num_regs - 4]`. Each row holds β₀, β₁, …,
78/// β₇ used by [`beta_horner`].
79const LOGLOG_BETA: [[f64; 8]; 15] = [
80    // log2_num_regs =  4
81    [
82        -0.582581413904517,
83        -1.935_300_357_560_05,
84        11.079323758035073,
85        -22.131357446444323,
86        22.505391846630037,
87        -12.000723834917984,
88        3.220579408194167,
89        -0.342225302271235,
90    ],
91    // log2_num_regs =  5
92    [
93        -0.7518999460733967,
94        -0.959_003_007_774_876,
95        5.599_737_132_214_161,
96        -8.209_763_699_976_552,
97        6.509_125_489_447_204,
98        -2.683_029_373_432_373,
99        0.5612891113138221,
100        -0.0463331622196545,
101    ],
102    // log2_num_regs =  6
103    [
104        29.825_790_096_961_963,
105        -31.328_708_333_772_592,
106        -10.594_252_303_658_228,
107        -11.572_012_568_909_962,
108        3.818_875_437_390_749,
109        -2.416_013_032_853_081,
110        0.4542208940970826,
111        -0.0575155452020420,
112    ],
113    // log2_num_regs =  7
114    [
115        2.810_292_129_082_006,
116        -3.9780498518175995,
117        1.3162680041351582,
118        -3.925_248_633_580_59,
119        2.008_083_575_394_647,
120        -0.7527151937556955,
121        0.1265569894242751,
122        -0.0109946438726240,
123    ],
124    // log2_num_regs =  8
125    [
126        1.006_335_448_875_505_2,
127        -2.005_806_664_051_124,
128        1.643_697_493_665_141_2,
129        -2.705_608_099_405_661_7,
130        1.392_099_802_442_226,
131        -0.464_703_742_721_831_9,
132        0.07384282377269775,
133        -0.00578554885254223,
134    ],
135    // log2_num_regs =  9
136    [
137        -0.09415657458167959,
138        -0.781_309_759_245_505_3,
139        1.715_149_467_507_124_6,
140        -1.737_112_504_065_163_4,
141        0.864_415_084_890_489_2,
142        -0.23819027465047218,
143        0.03343448400269076,
144        -0.00207858528178157,
145    ],
146    // log2_num_regs =  10
147    [
148        -0.25935400670790054,
149        -0.525_983_019_998_058_1,
150        1.489_330_349_258_768_4,
151        -1.296_427_140_849_935_7,
152        0.622_847_562_172_216_2,
153        -0.156_723_267_702_510_4,
154        0.02054415903878563,
155        -0.00112488483925502,
156    ],
157    // log2_num_regs =  11
158    [
159        -4.32325553856025e-01,
160        -1.08450736399632e-01,
161        6.09156550741120e-01,
162        -1.65687801845180e-02,
163        -7.95829341087617e-02,
164        4.71830602102918e-02,
165        -7.81372902346934e-03,
166        5.84268708489995e-04,
167    ],
168    // log2_num_regs =  12
169    [
170        -3.84979202588598e-01,
171        1.83162233114364e-01,
172        1.30396688841854e-01,
173        7.04838927629266e-02,
174        -8.95893971464453e-03,
175        1.13010036741605e-02,
176        -1.94285569591290e-03,
177        2.25435774024964e-04,
178    ],
179    // log2_num_regs =  13
180    [
181        -0.41655270946462997,
182        -0.22146677040685156,
183        0.38862131236999947,
184        0.453_409_797_460_623_7,
185        -0.36264738324476375,
186        0.12304650053558529,
187        -0.017_015_403_845_555_1,
188        0.00102750367080838,
189    ],
190    // log2_num_regs =  14
191    [
192        -3.71009760230692e-01,
193        9.78811941207509e-03,
194        1.85796293324165e-01,
195        2.03015527328432e-01,
196        -1.16710521803686e-01,
197        4.31106699492820e-02,
198        -5.99583540511831e-03,
199        4.49704299509437e-04,
200    ],
201    // log2_num_regs = 15
202    [
203        -0.38215145543875273,
204        -0.890_694_005_360_908_4,
205        0.376_023_357_746_788_7,
206        0.993_359_774_406_823_8,
207        -0.655_774_416_383_189_6,
208        0.183_323_421_297_036_1,
209        -0.02241529633062872,
210        0.00121399789330194,
211    ],
212    // log2_num_regs = 16
213    [
214        -0.373_318_766_437_530_6,
215        -1.417_040_774_481_23,
216        0.40729184796612533,
217        1.561_520_339_065_841_6,
218        -0.992_422_335_342_861_3,
219        0.260_646_813_994_830_9,
220        -0.03053811369682807,
221        0.00155770210179105,
222    ],
223    // log2_num_regs = 17
224    [
225        -0.36775502299404605,
226        0.538_314_223_513_779_7,
227        0.769_702_892_787_679_2,
228        0.550_025_835_864_505_6,
229        -0.745_755_882_611_469_4,
230        0.257_118_357_858_219_5,
231        -0.03437902606864149,
232        0.00185949146371616,
233    ],
234    // log2_num_regs = 18
235    [
236        -0.364_796_233_259_605_4,
237        0.997_304_123_286_350_3,
238        1.553_543_862_300_812_2,
239        1.259_326_771_980_289_2,
240        -1.533_259_482_091_101_6,
241        0.478_010_422_000_565_9,
242        -0.05951025172951174,
243        0.00291076804642205,
244    ],
245];
246
247/// Computes the LogLog-β bias correction using Horner's method.
248///
249/// The method appears in the paper from Jason Qin, Denys Kim & Yumei Tung,
250/// “[LogLog-Beta and More: A New Algorithm for Cardinality Estimation Based on
251/// LogLog Counting](https://arxiv.org/pdf/1612.02284)”, 2016.
252///
253/// # Panics
254///
255/// If `log2_num_regs` is not in [4 . . 18].
256pub fn beta_horner(z: f64, log2_num_regs: usize) -> f64 {
257    let beta = LOGLOG_BETA[log2_num_regs - 4];
258    let zl = (z + 1.0).ln();
259    let mut res = 0.0;
260    for i in (1..8).rev() {
261        res = res * zl + beta[i];
262    }
263    res * zl + beta[0] * z
264}
265
266/// Applies the post-loop correction to the harmonic mean and zero count.
267///
268/// This function is `#[inline(never)]` so that the register-iteration loop
269/// in [`estimate`](EstimationLogic::estimate) compiles identically for both
270/// `BETA=true` and `BETA=false`, preventing the post-loop formula from
271/// influencing loop optimizations (vectorization, unrolling, scheduling).
272#[inline(never)]
273pub(crate) fn apply_correction<const BETA: bool>(
274    harmonic_mean: f64,
275    zeroes: usize,
276    num_regs: usize,
277    log2_num_regs: u32,
278    alpha_m_m: f64,
279) -> f64 {
280    if BETA && zeroes != 0 && log2_num_regs <= 18 {
281        // LogLog-β: a single formula that replaces both the raw HyperLogLog
282        // estimate and the linear-counting small-range correction.
283        let m = num_regs as f64;
284        let z = zeroes as f64;
285        let beta = beta_horner(z, log2_num_regs as usize);
286        alpha_m_m * (m - z) / (m * (harmonic_mean + beta))
287    } else {
288        // Classic HyperLogLog raw estimate with linear-counting correction.
289        let mut estimate = alpha_m_m / harmonic_mean;
290        if zeroes != 0 && estimate < 2.5 * num_regs as f64 {
291            let m = num_regs as f64;
292            estimate = m * (m / zeroes as f64).ln();
293        }
294        estimate
295    }
296}
297
298/// Estimation logic implementing the HyperLogLog algorithm.
299///
300/// This implementation uses 5- or 6-bit registers and [broadword
301/// programming](https://doi.org/10.1145/1963405.1963493). It thus uses the
302/// minimum possible space, saving 37.5 or 25% space with respect to the
303/// [`HyperLogLog8`](crate::impls::HyperLogLog8) logic, which uses 8-bit
304/// registers and byte-wise SIMD operations, but it is significantly slower than
305/// the latter.
306///
307/// The choice between the two logics should be guided by the specific use case
308/// and constraints of your application. Please try the included benchmarks to
309/// have an idea of the difference in performance between the two logics in your
310/// environment.
311///
312/// Instances are created through [`HyperLogLogBuilder`]:
313///
314/// ```
315/// # use card_est_array::impls::HyperLogLogBuilder;
316/// // Default: LogLog-β correction enabled, usize backend
317/// let logic = HyperLogLogBuilder::new(1_000_000)
318///     .log2_num_regs(8)
319///     .build::<String>().unwrap();
320///
321/// // Disable LogLog-β, use classic HyperLogLog + linear-counting fallback
322/// let logic = HyperLogLogBuilder::new(1_000_000)
323///     .log2_num_regs(8)
324///     .beta::<false>()
325///     .build::<String>().unwrap();
326/// ```
327///
328/// # Type parameters
329///
330/// - `T`: the type of elements to count (must implement [`Hash`]).
331///
332/// - `H`: the [`BuildHasher`] used to hash elements.
333///
334/// - `W`: the unsigned word type for the register backend (see below).
335///
336/// - `BETA`: when `true` (the default), the [LogLog-β](beta_horner) bias
337///   correction is used during estimation. This provides better accuracy across
338///   the full cardinality range through a single formula, eliminating the need
339///   for a separate linear-counting correction. The cost is roughly 20ns per
340///   estimate call when some registers are still zero; when all registers are
341///   populated, the correction is skipped and performance is identical to the
342///   classic formula. Set to `false` via [`HyperLogLogBuilder::beta`] to use the
343///   original HyperLogLog formula instead.
344///
345/// # Backend alignment
346///
347/// `W` must be able to represent exactly the backend of an estimator. While
348/// usually `usize` will work (and it is the default type chosen by
349/// [`HyperLogLogBuilder::new`]), with odd register sizes and a small number
350/// of registers it might be necessary to select a smaller type, resulting in
351/// slower merges. For example, using 16 5-bit registers one needs to use
352/// `u16`, whereas for 16 6-bit registers `u32` will be sufficient.
353///
354/// Formally, `W::BITS` must divide `(1 << log2_num_regs) * register_size`
355/// (using [`HyperLogLog::register_size(num_elements)`](HyperLogLog::register_size)).
356/// [`HyperLogLogBuilder::min_log2_num_regs`] returns the minimum value for
357/// `log2_num_regs` that satisfies this property.
358#[derive(Debug, PartialEq)]
359pub struct HyperLogLog<T, H, W = usize, const BETA: bool = true> {
360    build_hasher: H,
361    register_size: usize,
362    num_regs_minus_1: HashResult,
363    log2_num_regs: u32,
364    sentinel_mask: HashResult,
365    num_regs: usize,
366    pub(super) words_per_estimator: usize,
367    alpha_m_m: f64,
368    msb_mask: Box<[W]>,
369    lsb_mask: Box<[W]>,
370    _marker: std::marker::PhantomData<T>,
371}
372
373// We implement Clone manually because we do not want to require that T is
374// Clone.
375impl<T, H: Clone, W: Clone, const BETA: bool> Clone for HyperLogLog<T, H, W, BETA> {
376    fn clone(&self) -> Self {
377        Self {
378            build_hasher: self.build_hasher.clone(),
379            register_size: self.register_size,
380            num_regs_minus_1: self.num_regs_minus_1,
381            log2_num_regs: self.log2_num_regs,
382            sentinel_mask: self.sentinel_mask,
383            num_regs: self.num_regs,
384            words_per_estimator: self.words_per_estimator,
385            alpha_m_m: self.alpha_m_m,
386            msb_mask: self.msb_mask.clone(),
387            lsb_mask: self.lsb_mask.clone(),
388            _marker: std::marker::PhantomData,
389        }
390    }
391}
392
393impl<T, H: Clone, W: Word, const BETA: bool> HyperLogLog<T, H, W, BETA> {
394    /// Returns the base-2 logarithm of the number of registers per estimator.
395    pub fn log2_num_regs(&self) -> u32 {
396        self.log2_num_regs
397    }
398
399    /// Returns the value contained in a register of a given backend.
400    ///
401    /// # Safety
402    ///
403    /// The caller must ensure that `index` is less than the [number of
404    /// registers per estimator](Self::log2_num_regs), and that `backend` has
405    /// length at least [`SliceEstimationLogic::backend_len`].
406    #[inline(always)]
407    unsafe fn get_register_unchecked(&self, backend: impl AsRef<[W]>, index: usize) -> W {
408        let backend = backend.as_ref();
409        let bits = W::BITS as usize;
410        let bit_width = self.register_size;
411        let mask = W::MAX >> (bits - bit_width);
412        let pos = index * bit_width;
413        let word_index = pos / bits;
414        let bit_index = pos % bits;
415
416        if bit_index + bit_width <= bits {
417            (unsafe { *backend.get_unchecked(word_index) } >> bit_index) & mask
418        } else {
419            ((unsafe { *backend.get_unchecked(word_index) } >> bit_index)
420                | (unsafe { *backend.get_unchecked(word_index + 1) } << (bits - bit_index)))
421                & mask
422        }
423    }
424
425    /// Sets the value contained in a register of a given backend.
426    ///
427    /// # Safety
428    ///
429    /// The caller must ensure that `index` is less than the [number of
430    /// registers per estimator](Self::log2_num_regs), and that `backend` has
431    /// length at least [`SliceEstimationLogic::backend_len`].
432    #[inline(always)]
433    unsafe fn set_register_unchecked(
434        &self,
435        mut backend: impl AsMut<[W]>,
436        index: usize,
437        new_value: W,
438    ) {
439        let backend = backend.as_mut();
440        let bits = W::BITS as usize;
441        let bit_width = self.register_size;
442        let mask = W::MAX >> (bits - bit_width);
443        let pos = index * bit_width;
444        let word_index = pos / bits;
445        let bit_index = pos % bits;
446
447        if bit_index + bit_width <= bits {
448            let mut word = unsafe { *backend.get_unchecked_mut(word_index) };
449            word &= !(mask << bit_index);
450            word |= new_value << bit_index;
451            unsafe { *backend.get_unchecked_mut(word_index) = word };
452        } else {
453            let mut word = unsafe { *backend.get_unchecked_mut(word_index) };
454            word &= (W::ONE << bit_index) - W::ONE;
455            word |= new_value << bit_index;
456            unsafe { *backend.get_unchecked_mut(word_index) = word };
457
458            let mut word = unsafe { *backend.get_unchecked_mut(word_index + 1) };
459            word &= !(mask >> (bits - bit_index));
460            word |= new_value >> (bits - bit_index);
461            unsafe { *backend.get_unchecked_mut(word_index + 1) = word };
462        }
463    }
464}
465
466impl<T: Hash, H: BuildHasher + Clone, W: Word, const BETA: bool> SliceEstimationLogic<W>
467    for HyperLogLog<T, H, W, BETA>
468where
469    u32: PrimitiveNumberAs<W>,
470{
471    #[inline(always)]
472    fn backend_len(&self) -> usize {
473        self.words_per_estimator
474    }
475}
476
477impl<T: Hash, H: BuildHasher + Clone, W: Word, const BETA: bool> EstimationLogic
478    for HyperLogLog<T, H, W, BETA>
479where
480    u32: PrimitiveNumberAs<W>,
481{
482    type Item = T;
483    type Backend = [W];
484    type Estimator<'a>
485        = DefaultEstimator<Self, &'a Self, Box<[W]>>
486    where
487        T: 'a,
488        W: 'a,
489        H: 'a;
490
491    fn new_estimator(&self) -> Self::Estimator<'_> {
492        Self::Estimator::new(
493            self,
494            vec![W::ZERO; self.words_per_estimator].into_boxed_slice(),
495        )
496    }
497
498    fn add(&self, backend: &mut Self::Backend, element: impl Borrow<T>) {
499        assert_backend_len!(self, backend);
500        let hash = self.build_hasher.hash_one(element.borrow());
501        let register = (hash & self.num_regs_minus_1) as usize;
502        // The number of trailing zeroes is certainly expressible in
503        // a variable of type W
504        let r = ((hash >> self.log2_num_regs) | self.sentinel_mask)
505            .trailing_zeros()
506            .as_to();
507
508        debug_assert!(r < (W::ONE << self.register_size) - W::ONE);
509        debug_assert!(register < self.num_regs);
510
511        let current_value = unsafe { self.get_register_unchecked(&*backend, register) };
512        let candidate_value = r + W::ONE;
513        let new_value = std::cmp::max(current_value, candidate_value);
514        if current_value != new_value {
515            unsafe { self.set_register_unchecked(backend, register, new_value) };
516        }
517    }
518
519    fn estimate(&self, backend: &[W]) -> f64 {
520        assert_backend_len!(self, backend);
521        let mut harmonic_mean = 0.0;
522        let mut zeroes = 0usize;
523
524        for i in 0..self.num_regs {
525            let value: usize = unsafe { self.get_register_unchecked(backend, i).as_to() };
526            if value == 0 {
527                zeroes += 1;
528            }
529            // 2⁻ᵛ via IEEE 754: exponent = 1023 − v, zero mantissa.
530            debug_assert!(value <= 1023);
531            harmonic_mean += f64::from_bits((1023 - value as u64) << 52);
532        }
533
534        apply_correction::<BETA>(
535            harmonic_mean,
536            zeroes,
537            self.num_regs,
538            self.log2_num_regs,
539            self.alpha_m_m,
540        )
541    }
542
543    #[inline(always)]
544    fn clear(&self, backend: &mut [W]) {
545        backend.fill(W::ZERO);
546    }
547
548    #[inline(always)]
549    fn set(&self, dst: &mut [W], src: &[W]) {
550        debug_assert_eq!(dst.len(), src.len());
551        dst.copy_from_slice(src);
552    }
553}
554
555/// Helper for merge operations with [`HyperLogLog`] logic.
556pub struct HyperLogLogHelper<W> {
557    acc: Vec<W>,
558    mask: Vec<W>,
559}
560
561impl<T: Hash, H: BuildHasher + Clone, W: Word, const BETA: bool> MergeEstimationLogic
562    for HyperLogLog<T, H, W, BETA>
563where
564    u32: PrimitiveNumberAs<W>,
565{
566    type Helper = HyperLogLogHelper<W>;
567
568    fn new_helper(&self) -> Self::Helper {
569        HyperLogLogHelper {
570            acc: vec![W::ZERO; self.words_per_estimator],
571            mask: vec![W::ZERO; self.words_per_estimator],
572        }
573    }
574
575    #[inline(always)]
576    fn merge_with_helper(&self, dst: &mut [W], src: &[W], helper: &mut Self::Helper) {
577        merge_hyperloglog_bitwise(
578            dst,
579            src,
580            self.msb_mask.as_ref(),
581            self.lsb_mask.as_ref(),
582            &mut helper.acc,
583            &mut helper.mask,
584            self.register_size,
585        );
586    }
587}
588
589impl<T, H, W, const BETA: bool> std::fmt::Display for HyperLogLog<T, H, W, BETA> {
590    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
591        write!(
592            f,
593            "HyperLogLog with relative standard deviation: {}% ({} registers/estimator, {} bits/register, {} bytes/estimator)",
594            100.0 * HyperLogLog::rel_std(self.log2_num_regs),
595            self.num_regs,
596            self.register_size,
597            (self.num_regs * self.register_size) / 8
598        )
599    }
600}
601
602/// Builder for [`HyperLogLog`] cardinality-estimation logic.
603///
604/// The builder lets you configure:
605/// - the upper bound on the number of distinct elements
606///   ([`new`](Self::new) / [`num_elements`](Self::num_elements));
607/// - the number of registers, either directly
608///   ([`log2_num_regs`](Self::log2_num_regs)) or via a target relative
609///   standard deviation ([`rsd`](Self::rsd));
610/// - the backend word type ([`word_type`](Self::word_type));
611/// - the hash function ([`build_hasher`](Self::build_hasher));
612/// - whether [LogLog-β bias correction](beta_horner) is enabled
613///   ([`beta`](Self::beta)).
614///
615/// Call [`build`](Self::build) to obtain the configured [`HyperLogLog`]
616/// logic.
617#[derive(Debug, Clone)]
618pub struct HyperLogLogBuilder<H, W = usize, const BETA: bool = true> {
619    build_hasher: H,
620    log2_num_regs: u32,
621    num_elements: usize,
622    _marker: std::marker::PhantomData<W>,
623}
624
625impl HyperLogLogBuilder<BuildHasherDefault<DefaultHasher>> {
626    /// Creates a new builder for a [`HyperLogLog`] logic with the default word
627    /// type (the fixed-size equivalent of `usize`).
628    ///
629    /// # Panics
630    ///
631    /// If `num_elements` is zero.
632    pub const fn new(num_elements: usize) -> Self {
633        assert!(
634            num_elements > 0,
635            "the upper bound on the number of distinct elements must be positive"
636        );
637        Self {
638            build_hasher: BuildHasherDefault::new(),
639            log2_num_regs: 4,
640            num_elements,
641            _marker: std::marker::PhantomData,
642        }
643    }
644}
645
646fn min_alignment(bits: usize) -> String {
647    if bits % 128 == 0 {
648        "u128"
649    } else if bits % 64 == 0 {
650        "u64"
651    } else if bits % 32 == 0 {
652        "u32"
653    } else if bits % 16 == 0 {
654        "u16"
655    } else {
656        "u8"
657    }
658    .to_string()
659}
660
661impl HyperLogLog<(), (), (), true> {
662    /// Returns the logarithm of the number of registers per estimator that are
663    /// necessary to attain a given relative standard deviation.
664    ///
665    /// # Arguments
666    /// * `rsd`: the relative standard deviation to be attained.
667    pub fn log2_num_of_registers(rsd: f64) -> u32 {
668        (((1.106 / rsd).powi(2)).log2().ceil() as u32).max(4)
669    }
670
671    /// Returns the relative standard deviation corresponding to a given number
672    /// of registers per estimator.
673    ///
674    /// # Arguments
675    ///
676    /// * `log2_num_regs`: the logarithm of the number of registers per
677    ///   estimator.
678    pub fn rel_std(log2_num_regs: u32) -> f64 {
679        let tmp = match log2_num_regs {
680            4 => 1.106,
681            5 => 1.070,
682            6 => 1.054,
683            7 => 1.046,
684            _ => 1.04,
685        };
686        tmp / ((1 << log2_num_regs) as f64).sqrt()
687    }
688
689    /// Returns the register size in bits, given an upper bound on the number of
690    /// distinct elements.
691    ///
692    /// # Arguments
693    /// * `num_elements`: an upper bound on the number of distinct elements.
694    pub fn register_size(num_elements: usize) -> usize {
695        std::cmp::max(
696            5,
697            (((num_elements as f64).ln() / LN_2) / LN_2).ln().ceil() as usize,
698        )
699    }
700}
701
702impl<H, W: Word, const BETA: bool> HyperLogLogBuilder<H, W, BETA> {
703    /// Sets the desired relative standard deviation.
704    ///
705    /// This is a high-level alternative to [`Self::log2_num_regs`]. Calling one
706    /// after the other invalidates the work done by the first one.
707    ///
708    /// # Arguments
709    /// * `rsd`: the relative standard deviation to be attained.
710    ///
711    /// # Panics
712    ///
713    /// If the resulting number of registers is less than 16 (i.e., `rsd` is
714    /// too large).
715    pub fn rsd(self, rsd: f64) -> Self {
716        self.log2_num_regs(HyperLogLog::log2_num_of_registers(rsd))
717    }
718
719    /// Sets the base-2 logarithm of the number of registers.
720    ///
721    /// This is a low-level alternative to [`Self::rsd`]. Calling one after the
722    /// other invalidates the work done by the first one.
723    ///
724    /// # Arguments
725    /// * `log2_num_regs`: the logarithm of the number of registers per
726    ///   estimator.
727    ///
728    /// # Panics
729    ///
730    /// If `log2_num_regs` is less than 4.
731    pub const fn log2_num_regs(mut self, log2_num_regs: u32) -> Self {
732        assert!(
733            log2_num_regs >= 4,
734            "the logarithm of the number of registers per estimator should be at least 4"
735        );
736        self.log2_num_regs = log2_num_regs;
737        self
738    }
739
740    /// Returns the minimum value allowed for [`Self::log2_num_regs`] given the
741    /// current value of [`Self::num_elements`].
742    pub fn min_log2_num_regs(&self) -> u32 {
743        let register_size = HyperLogLog::register_size(self.num_elements);
744        let register_size = NonZeroUsize::try_from(register_size).expect("register_size is zero");
745        let min_num_regs = W::BITS / highest_power_of_2_dividing(register_size);
746        assert_eq!(min_num_regs, min_num_regs.next_power_of_two());
747        min_num_regs.trailing_zeros() // log2(min_num_regs)
748    }
749
750    /// Sets the type `W` to use to represent backends.
751    ///
752    /// Note that the returned builder will have a different type if `W2` is
753    /// different from `W`.
754    ///
755    /// See the [`logic documentation`](HyperLogLog) for the limitations on the
756    /// choice of `W2`.
757    pub fn word_type<W2>(self) -> HyperLogLogBuilder<H, W2, BETA> {
758        HyperLogLogBuilder {
759            num_elements: self.num_elements,
760            build_hasher: self.build_hasher,
761            log2_num_regs: self.log2_num_regs,
762            _marker: std::marker::PhantomData,
763        }
764    }
765
766    /// Enables or disables the [LogLog-β bias correction](beta_horner) in
767    /// the estimate.
768    ///
769    /// When enabled (the default), the estimate uses the LogLog-β formula for
770    /// `log_num_regs` 4–18, which provides better accuracy across the full
771    /// cardinality range without a separate linear-counting correction, at the
772    /// cost of roughly 20ns per estimate call when some registers are still
773    /// zero. When all registers are populated, the correction is skipped and
774    /// performance is identical to the classic formula.
775    ///
776    /// When disabled, the classic HyperLogLog formula with linear-counting
777    /// fallback is used.
778    ///
779    /// ```
780    /// # use card_est_array::impls::HyperLogLogBuilder;
781    /// // Disable LogLog-β correction
782    /// let logic = HyperLogLogBuilder::new(1_000_000)
783    ///     .log2_num_regs(8)
784    ///     .beta::<false>()
785    ///     .build::<usize>().unwrap();
786    /// ```
787    pub fn beta<const BETA2: bool>(self) -> HyperLogLogBuilder<H, W, BETA2> {
788        HyperLogLogBuilder {
789            num_elements: self.num_elements,
790            build_hasher: self.build_hasher,
791            log2_num_regs: self.log2_num_regs,
792            _marker: std::marker::PhantomData,
793        }
794    }
795
796    /// Sets the upper bound on the number of elements.
797    ///
798    /// # Panics
799    ///
800    /// If `n` is zero.
801    pub const fn num_elements(mut self, num_elements: usize) -> Self {
802        assert!(
803            num_elements > 0,
804            "the upper bound on the number of distinct elements must be positive"
805        );
806        self.num_elements = num_elements;
807        self
808    }
809
810    /// Sets the [`BuildHasher`] to use.
811    ///
812    /// Using this method you can select a specific hasher based on one or more
813    /// seeds.
814    pub fn build_hasher<H2>(self, build_hasher: H2) -> HyperLogLogBuilder<H2, W, BETA> {
815        HyperLogLogBuilder {
816            num_elements: self.num_elements,
817            log2_num_regs: self.log2_num_regs,
818            build_hasher,
819            _marker: std::marker::PhantomData,
820        }
821    }
822
823    /// Builds the logic.
824    ///
825    /// The type of objects the estimators keep track of is defined here by `T`,
826    /// but it is usually inferred by the compiler.
827    ///
828    /// # Errors
829    ///
830    /// Returns [`HyperLogLogError::RegisterSizeTooLarge`] if the register size
831    /// derived from `num_elements` exceeds the maximum supported by the hash
832    /// type.
833    ///
834    /// Returns [`HyperLogLogError::UnalignedBackend`] if the estimator size in
835    /// bits is not divisible by the bit width of `W`.
836    pub fn build<T>(self) -> Result<HyperLogLog<T, H, W, BETA>, HyperLogLogError> {
837        let bits = W::BITS as usize;
838        let log2_num_regs = self.log2_num_regs;
839        let num_elements = self.num_elements;
840        let number_of_registers = 1 << log2_num_regs;
841        let register_size = HyperLogLog::register_size(num_elements);
842        if register_size > 6 {
843            return Err(HyperLogLogError::RegisterSizeTooLarge {
844                register_size,
845                num_elements,
846                hash_bits: HashResult::BITS,
847            });
848        }
849        let sentinel_mask = 1 << ((1 << register_size) - 2);
850        let alpha = match log2_num_regs {
851            4 => 0.673,
852            5 => 0.697,
853            6 => 0.709,
854            _ => 0.7213 / (1.0 + 1.079 / number_of_registers as f64),
855        };
856        let num_regs_minus_1 = (number_of_registers - 1) as HashResult;
857
858        let est_size_in_bits = number_of_registers * register_size;
859
860        // This ensures estimators are always aligned to W
861        if est_size_in_bits % bits != 0 {
862            return Err(HyperLogLogError::UnalignedBackend {
863                est_size_in_bits,
864                word_bits: bits,
865                min_alignment: min_alignment(est_size_in_bits),
866                min_log2_num_regs: self.min_log2_num_regs(),
867            });
868        }
869        let est_size_in_words = est_size_in_bits / bits;
870
871        let msb_mask = build_register_mask(
872            est_size_in_words,
873            register_size,
874            W::ONE << (register_size - 1),
875        );
876        let lsb_mask = build_register_mask(est_size_in_words, register_size, W::ONE);
877
878        Ok(HyperLogLog {
879            num_regs: number_of_registers,
880            num_regs_minus_1,
881            log2_num_regs,
882            register_size,
883            alpha_m_m: alpha * (number_of_registers as f64).powi(2),
884            sentinel_mask,
885            build_hasher: self.build_hasher,
886            msb_mask,
887            lsb_mask,
888            words_per_estimator: est_size_in_words,
889            _marker: std::marker::PhantomData,
890        })
891    }
892}
893
894/// Builds a mask of `num_words` words by repeating a `register_size`-bit
895/// pattern across all register positions.
896fn build_register_mask<W: Word>(num_words: usize, register_size: usize, pattern: W) -> Box<[W]> {
897    let bits = W::BITS as usize;
898    let total_bits = num_words * bits;
899    let mut result = vec![W::ZERO; num_words];
900    let mut bit_pos = 0;
901    while bit_pos < total_bits {
902        let word_index = bit_pos / bits;
903        let bit_index = bit_pos % bits;
904        result[word_index] |= pattern << bit_index;
905        if bit_index + register_size > bits && word_index + 1 < num_words {
906            result[word_index + 1] |= pattern >> (bits - bit_index);
907        }
908        bit_pos += register_size;
909    }
910    result.into_boxed_slice()
911}
912
913/// Performs a multiple precision subtraction, leaving the result in the first operand.
914/// The operands MUST have the same length.
915///
916/// # Arguments
917/// * `x`: the first operand. This will contain the final result.
918/// * `y`: the second operand that will be subtracted from `x`.
919#[inline(always)]
920pub(super) fn subtract<W: Word>(x: &mut [W], y: &[W]) {
921    debug_assert_eq!(x.len(), y.len());
922    let mut borrow = false;
923
924    for (x_word, &y) in x.iter_mut().zip(y.iter()) {
925        let mut x = *x_word;
926        if !borrow {
927            borrow = x < y;
928        } else if x != W::ZERO {
929            x = x.wrapping_sub(W::ONE);
930            borrow = x < y;
931        } else {
932            x = x.wrapping_sub(W::ONE);
933        }
934        *x_word = x.wrapping_sub(y);
935    }
936}
937
938fn merge_hyperloglog_bitwise<W: Word>(
939    mut x: impl AsMut<[W]>,
940    y: impl AsRef<[W]>,
941    msb_mask: impl AsRef<[W]>,
942    lsb_mask: impl AsRef<[W]>,
943    acc: &mut Vec<W>,
944    mask: &mut Vec<W>,
945    register_size: usize,
946) {
947    let x = x.as_mut();
948    let y = y.as_ref();
949    let msb_mask = msb_mask.as_ref();
950    let lsb_mask = lsb_mask.as_ref();
951
952    debug_assert_eq!(x.len(), y.len());
953    debug_assert_eq!(x.len(), msb_mask.len());
954    debug_assert_eq!(x.len(), lsb_mask.len());
955
956    let register_size_minus_1 = register_size - 1;
957    let num_words_minus_1 = x.len() - 1;
958    let shift_register_size_minus_1 = W::BITS as usize - register_size_minus_1;
959
960    acc.clear();
961    mask.clear();
962
963    /* We work in two phases. Let H_r (msb_mask) be the mask with the
964     * highest bit of each register (of size r) set, and L_r (lsb_mask)
965     * be the mask with the lowest bit of each register set.
966     * We describe the algorithm on a single word.
967     *
968     * In the first phase we perform an unsigned strict register-by-register
969     * comparison of x and y, using the formula
970     *
971     * z = ((((y | H_r) - (x & !H_r)) | (y ^ x)) ^ (y | !x)) & H_r
972     *
973     * Then, we generate a register-by-register mask of all ones or
974     * all zeroes, depending on the result of the comparison, using the
975     * formula
976     *
977     * (((z >> r-1 | H_r) - L_r) | H_r) ^ z
978     *
979     * At that point, it is trivial to select from x and y the right values.
980     */
981
982    // We load y | H_r into the accumulator.
983    acc.extend(
984        y.iter()
985            .zip(msb_mask)
986            .map(|(&y_word, &msb_word)| y_word | msb_word),
987    );
988
989    // We load x & !H_r into mask as temporary storage.
990    mask.extend(
991        x.iter()
992            .zip(msb_mask)
993            .map(|(&x_word, &msb_word)| x_word & !msb_word),
994    );
995
996    // We subtract x & !H_r, using mask as temporary storage
997    subtract(acc, mask);
998
999    // We OR with y ^ x, XOR with (y | !x), and finally AND with H_r.
1000    acc.iter_mut()
1001        .zip(x.iter())
1002        .zip(y.iter())
1003        .zip(msb_mask.iter())
1004        .for_each(|(((acc_word, &x_word), &y_word), &msb_word)| {
1005            *acc_word = ((*acc_word | (y_word ^ x_word)) ^ (y_word | !x_word)) & msb_word
1006        });
1007
1008    // We shift by register_size - 1 places and put the result into mask.
1009    {
1010        let (mask_last, mask_slice) = mask.split_last_mut().unwrap();
1011        let (&msb_last, msb_slice) = msb_mask.split_last().unwrap();
1012        mask_slice
1013            .iter_mut()
1014            .zip(acc[0..num_words_minus_1].iter())
1015            .zip(acc[1..].iter())
1016            .zip(msb_slice.iter())
1017            .rev()
1018            .for_each(|(((mask_word, &acc_word), &next_acc_word), &msb_word)| {
1019                // W is always unsigned so the shift is always with a 0
1020                *mask_word = (acc_word >> register_size_minus_1)
1021                    | (next_acc_word << shift_register_size_minus_1)
1022                    | msb_word
1023            });
1024        *mask_last = (acc[num_words_minus_1] >> register_size_minus_1) | msb_last;
1025    }
1026
1027    // We subtract L_r from mask.
1028    subtract(mask, lsb_mask);
1029
1030    // We OR with H_r and XOR with the accumulator.
1031    mask.iter_mut()
1032        .zip(msb_mask.iter())
1033        .zip(acc.iter())
1034        .for_each(|((mask_word, &msb_word), &acc_word)| {
1035            *mask_word = (*mask_word | msb_word) ^ acc_word
1036        });
1037
1038    // Finally, we use mask to select the right bits from x and y and store the result.
1039    x.iter_mut()
1040        .zip(y.iter())
1041        .zip(mask.iter())
1042        .for_each(|((x_word, &y_word), &mask_word)| {
1043            *x_word = *x_word ^ ((*x_word ^ y_word) & mask_word);
1044        });
1045}
1046
1047fn highest_power_of_2_dividing(n: NonZeroUsize) -> u32 {
1048    1 << n.trailing_zeros()
1049}
1050
1051#[test]
1052fn test_highest_power_of_2_dividing() {
1053    let powers_of_2: Vec<_> = (1..=20)
1054        .map(|n| highest_power_of_2_dividing(n.try_into().unwrap()))
1055        .collect();
1056    assert_eq!(
1057        powers_of_2,
1058        vec![1, 2, 1, 4, 1, 2, 1, 8, 1, 2, 1, 4, 1, 2, 1, 16, 1, 2, 1, 4]
1059    );
1060}