hyperloglog_rs/utils.rs
1//! # Utils
2//!
3//! This module provides utility functions used by the HyperLogLog algorithm implementation.
4//!
5//! The functions provided are:
6//!
7//! - `ceil(numerator: usize, denominator: usize) -> usize`: Calculates the integer ceil of the division
8//! of `numerator` by `denominator`.
9//!
10//! - `word_from_registers<const NUMBER_OF_BITS_PER_REGISTER: usize>(registers: &[u32]) -> u32`: Converts an array
11//! of HLL registers into a single 32-bit word.
12//!
13//!
14use crate::log::log;
15
16#[inline]
17/// Calculates the integer ceil of the division of `numerator` by `denominator`.
18///
19/// # Examples
20///
21/// Test with evenly divisible values:
22/// ```rust
23/// # use hyperloglog_rs::utils::ceil;
24/// assert_eq!(ceil(10, 5), 2);
25/// assert_eq!(ceil(25, 5), 5);
26/// assert_eq!(ceil(100, 10), 10);
27/// ```
28///
29/// Test with values that require rounding up:
30/// ```
31/// # use hyperloglog_rs::utils::ceil;
32/// assert_eq!(ceil(3, 2), 2);
33/// assert_eq!(ceil(7, 3), 3);
34/// assert_eq!(ceil(11, 4), 3);
35/// assert_eq!(ceil(100, 7), 15);
36/// ```
37///
38pub const fn ceil(numerator: usize, denominator: usize) -> usize {
39 (numerator + denominator - 1) / denominator
40}
41
42/// Precomputes small corrections for HyperLogLog algorithm.
43///
44/// This function calculates a correction factor for each register that helps to improve the
45/// accuracy of the HyperLogLog algorithm. The corrections are stored in an array and can be
46/// accessed for later use by other methods of the HyperLogLog struct.
47///
48/// # Arguments
49/// * `NUMBER_OF_REGISTERS` - The number of registers used in the HyperLogLog algorithm.
50///
51/// # Examples
52/// ```
53/// use hyperloglog_rs::utils::precompute_linear_counting;
54/// use hyperloglog_rs::log::log;
55///
56/// const NUMBER_OF_REGISTERS: usize = 16;
57/// let small_corrections = precompute_linear_counting::<NUMBER_OF_REGISTERS>();
58/// assert_eq!(small_corrections.len(), NUMBER_OF_REGISTERS);
59/// assert_eq!(small_corrections[0], NUMBER_OF_REGISTERS as f32 * log(NUMBER_OF_REGISTERS as f64) as f32);
60/// assert_eq!(small_corrections[1], NUMBER_OF_REGISTERS as f32 * log(NUMBER_OF_REGISTERS as f64 / 2.0_f64) as f32);
61/// ```
62pub const fn precompute_linear_counting<const NUMBER_OF_REGISTERS: usize>(
63) -> [f32; NUMBER_OF_REGISTERS] {
64 let mut small_corrections = [0_f32; NUMBER_OF_REGISTERS];
65 let mut i = 0;
66 // We can skip the last value in the small range correction array, because it is always 0.
67 while i < NUMBER_OF_REGISTERS - 1 {
68 small_corrections[i] =
69 (NUMBER_OF_REGISTERS as f64 * log(NUMBER_OF_REGISTERS as f64 / (i + 1) as f64)) as f32;
70 i += 1;
71 }
72 small_corrections
73}
74
75/// Computes the alpha constant for the given number of registers.
76///
77/// The alpha constant is used to scale the raw HyperLogLog estimate into an
78/// estimate of the true cardinality of the set.
79///
80/// # Arguments
81/// * `NUMBER_OF_REGISTERS`: The number of registers in the HyperLogLog
82/// data structure.
83///
84/// # Returns
85/// The alpha constant for the given number of registers.
86///
87/// # Examples
88///
89/// ```
90/// # use hyperloglog_rs::utils::get_alpha;
91///
92/// let alpha_16 = get_alpha(16);
93/// let alpha_32 = get_alpha(32);
94/// let alpha_64 = get_alpha(64);
95///
96/// assert_eq!(alpha_16, 0.673);
97/// assert_eq!(alpha_32, 0.697);
98/// assert_eq!(alpha_64, 0.709);
99///
100/// let alpha_4096 = get_alpha(4096);
101///
102/// assert_eq!(alpha_4096, 0.7213 / (1.0 + 1.079 / 4096.0));
103/// ```
104#[inline(always)]
105pub const fn get_alpha(number_of_registers: usize) -> f32 {
106 // Match the number of registers to the known alpha values
107 match number_of_registers {
108 16 => 0.673,
109 32 => 0.697,
110 64 => 0.709,
111 _ => 0.7213 / (1.0 + 1.079 / number_of_registers as f32),
112 }
113}
114
115#[inline]
116/// Returns an empirically determined threshold to decide on
117/// the use of linear counting.
118///
119/// # Arguments
120/// * `precision`: The precision of the HyperLogLog algorithm.
121///
122/// # References
123/// This data is made available by the authors of the paper
124/// in [this Google Docs document](https://docs.google.com/document/d/1gyjfMHy43U9OWBXxfaeG-3MjGzejW1dlpyMwEYAAWEI/view?fullscreen).
125///
126/// # Examples
127///
128/// ```rust
129/// # use hyperloglog_rs::utils::linear_counting_threshold;
130///
131/// assert_eq!(linear_counting_threshold(4), 10.0);
132/// assert_eq!(linear_counting_threshold(5), 20.0);
133/// assert_eq!(linear_counting_threshold(6), 40.0);
134/// assert_eq!(linear_counting_threshold(7), 80.0);
135/// ```
136pub const fn linear_counting_threshold(precision: usize) -> f32 {
137 match precision {
138 4 => 10.0,
139 5 => 20.0,
140 6 => 40.0,
141 7 => 80.0,
142 8 => 220.0,
143 9 => 400.0,
144 10 => 900.0,
145 11 => 1800.0,
146 12 => 3100.0,
147 13 => 6500.0,
148 14 => 11500.0,
149 15 => 20000.0,
150 16 => 50000.0,
151 17 => 120000.0,
152 18 => 350000.0,
153 // The documentation for the HyperLogLog algorithm only provides empirically determined thresholds for precisions from 4 to 18.
154 _ => unreachable!(),
155 }
156}