Skip to main content

rust_hll/
settings.rs

1use thiserror::Error;
2
3use crate::utils::divide_by_8_round_up;
4
5// minimum and maximum values for the log-base-2 of the number of registers
6// in the HLL
7const MINIMUM_LOG_2M_PARAM: u32 = 4;
8const MAXIMUM_LOG_2M_PARAM: u32 = 31;
9
10// minimum and maximum values for the register width of the HLL
11const MINIMUM_REG_WIDTH_PARAM: u32 = 1;
12const MAXIMUM_REG_WIDTH_PARAM: u32 = 8;
13
14// minimum and maximum values for the 'expthresh' parameter of the
15// constructor that is meant to match the PostgreSQL
16// implementation's constructor and parameter names
17const MINIMUM_EXPTHRESH_PARAM: i32 = -1;
18const MAXIMUM_EXPTHRESH_PARAM: i32 = 18;
19const MAXIMUM_EXPLICIT_THRESHOLD: u32 = 1 << (MAXIMUM_EXPTHRESH_PARAM - 1); /*per storage spec*/
20
21// AutoExplicitThreshold indicates that the threshold at which an Hll goes
22// from using an explicit to a probabalistic representation should be
23// calculated based on the configuration.  Using the calculated threshold is
24// generally preferable.  One exception would be working with a pre-existing
25// data set that uses a particular explicit threshold setting in which case
26// it may be desirable to use the same explicit threshold.
27const AUTO_EXPLICIT_THRESHOLD: i32 = -1;
28
29/// Settings are used to configure the Hll and how it transitions between the
30/// backing storage types.
31#[derive(Copy, Clone, Debug, PartialEq)]
32pub struct Settings {
33    /// log_2m determines the number of registers in the Hll.  The minimum value
34    /// is 4 and the maximum value is 31.  The number of registers in the Hll
35    /// will be calculated as 2^log_2m.
36    pub(crate) log_2m: u32,
37
38    /// reg_width is the number of bits dedicated to each register value.  The
39    /// minimum value is 1 and the maximum value is 8.
40    pub(crate) reg_width: u32,
41
42    /// ExplicitThreshold is the cardinality at which the Hll will go from
43    /// storing explicit values to using a probabilistic model.  A value of 0
44    /// disables explicit storage entirely.  The value AutoExplicitThreshold can
45    /// be used to signal the library to calculate an appropriate threshold
46    /// (recommended).  The maximum allowed value is 131,072.
47    pub(crate) explicit_threshold: i32,
48
49    /// SparseEnabled controls whether the Hll will use the sparse
50    /// representation.  The thresholds for conversion are automatically
51    /// calculated by the library when this field is set to true (recommended).
52    pub(crate) sparse_threshold: Option<i32>,
53
54    /// pw_max_mask is a mask that prevents overflow of HyperLogLog registers.
55    pub(crate) pw_max_mask: u64,
56
57    /// m_bits_mask is a precomputed mask where the bottom-most reg_width bits are set.
58    pub(crate) m_bits_mask: u64,
59    /// alpha * m^2 (the constant in the "'raw' HyperLogLog estimator")
60    pub(crate) alpha_msquared: f64,
61
62    /// small_estimator_cutoff is the cutoff value of the estimator for using the
63    /// "small" range cardinality correction formula
64    pub(crate) small_estimator_cutoff: f64,
65
66    /// large_estimator_cutoff is the cutoff value of the estimator for using the
67    /// "large" range cardinality correction formula
68    pub(crate) large_estimator_cutoff: f64,
69    pub(crate) two_to_l: f64,
70}
71
72#[derive(Clone, Debug, Error)]
73pub enum SettingsError {
74    #[error("log_2m must be between {MINIMUM_LOG_2M_PARAM}, {MAXIMUM_LOG_2M_PARAM}")]
75    Log2m,
76    #[error("reg_width must be between {MINIMUM_REG_WIDTH_PARAM}, {MAXIMUM_REG_WIDTH_PARAM}")]
77    RegWidth,
78    #[error("Threshold must be between {MINIMUM_EXPTHRESH_PARAM}, {MAXIMUM_EXPTHRESH_PARAM}")]
79    Threshold,
80    #[error("config mismatch. log_2m and reg_width must match when combining hll's")]
81    MisMatch,
82}
83
84impl Settings {
85    pub fn new(
86        log_2m: u32,
87        reg_width: u32,
88        explicit_threshold: i32,
89        sparse_enabled: bool,
90    ) -> Result<Self, SettingsError> {
91        let sparse_threshold = match sparse_enabled {
92            true => Some(Self::calculate_sparse_threshold(log_2m, reg_width)),
93            false => None,
94        };
95
96        let settings = Self {
97            log_2m,
98            reg_width,
99            explicit_threshold,
100            sparse_threshold,
101            pw_max_mask: Settings::pw_max_mask(reg_width),
102            m_bits_mask: ((1 << log_2m) - 1),
103            alpha_msquared: Settings::alpha_m_squared(log_2m),
104            small_estimator_cutoff: Settings::small_estimator_cutoff(1 << log_2m),
105            large_estimator_cutoff: Settings::large_estimator_cutoff(Settings::two_to_l(
106                log_2m, reg_width,
107            )),
108            two_to_l: Settings::two_to_l(log_2m, reg_width),
109        };
110
111        settings.validate()?;
112
113        Ok(settings)
114    }
115
116    pub fn validate(&self) -> Result<(), SettingsError> {
117        if !(MINIMUM_LOG_2M_PARAM..=MAXIMUM_LOG_2M_PARAM).contains(&self.log_2m) {
118            return Err(SettingsError::Log2m);
119        }
120
121        if !(MINIMUM_REG_WIDTH_PARAM..=MAXIMUM_REG_WIDTH_PARAM).contains(&self.reg_width) {
122            return Err(SettingsError::RegWidth);
123        }
124
125        // the unit tests are using 256 for their threshold which is larger than what the constructor in golang allows.
126        // if !(MINIMUM_EXPTHRESH_PARAM..=MAXIMUM_EXPTHRESH_PARAM).contains(&self.explicit_threshold) {
127        //     return Err(SettingsError::Threshold);
128        // }
129
130        Ok(())
131    }
132
133    pub fn settings_check(&self, other: &Self) -> Result<(), SettingsError> {
134        if self.log_2m == other.log_2m && self.reg_width == other.reg_width {
135            return Ok(());
136        }
137
138        Err(SettingsError::MisMatch)
139    }
140
141    pub fn explicit_threshold(&self) -> u32 {
142        match self.explicit_threshold {
143            AUTO_EXPLICIT_THRESHOLD => {
144                Self::calculate_explicit_threshold(self.log_2m, self.reg_width)
145            }
146            _ => self.explicit_threshold as u32,
147        }
148    }
149
150    /// determines a good cutoff to switch between explicit and probabilistic storage.
151    pub fn calculate_explicit_threshold(log_2m: u32, reg_width: u32) -> u32 {
152        // NOTE:  This math matches the size calculation in the PostgreSQL impl.
153        let m = 1 << log_2m;
154        let full_representation_size = divide_by_8_round_up(reg_width * m); /*round up to next whole byte*/
155        let num_longs = full_representation_size / 8;
156
157        if num_longs > MAXIMUM_EXPLICIT_THRESHOLD {
158            return MAXIMUM_EXPLICIT_THRESHOLD;
159        }
160
161        num_longs
162    }
163
164    // calculate_sparse_threshold determines a good cutoff to switch between sparse
165    // and dense probabilistic storage.
166    fn calculate_sparse_threshold(log_2m: u32, reg_width: u32) -> i32 {
167        let m = 1 << log_2m;
168        let short_word_length: f64 = (log_2m + reg_width).into();
169
170        let reg_bits: f64 = (m * reg_width).into();
171
172        let largest_pow2_less_than_cutoff: u32 = (reg_bits / short_word_length).log2() as u32;
173
174        1 << largest_pow2_less_than_cutoff
175    }
176
177    // pw_max_mask calculates the mask that is used to prevent overflow of HyperLogLog
178    // registers.
179    pub(crate) fn pw_max_mask(reg_width: u32) -> u64 {
180        let shift: u64 = (((1u64 << reg_width) - 1) - 1) % (u64::BITS as u64);
181        !((1u64 << shift) - 1)
182    }
183
184    /// alpha_m_squared calculates the 'alpha-m-squared' constant (gamma times
185    /// registerCount squared where gamma is based on the value of registerCount)
186    /// used by the HyperLogLog algorithm.
187    pub(crate) fn alpha_m_squared(log_2m: u32) -> f64 {
188        let m: f64 = (1 << log_2m).into();
189
190        match log_2m {
191            4 => 0.673 * m * m,
192            5 => 0.697 * m * m,
193            6 => 0.709 * m * m,
194            _ => (0.7213 / (1.0 + 1.079 / m)) * m * m,
195        }
196    }
197
198    // small_estimator_cutoff calculates the "small range correction" formula, in the
199    // HyperLogLog algorith based on the total number of registers (m)
200    pub(crate) fn small_estimator_cutoff(m: u32) -> f64 {
201        let m: f64 = m.into();
202        (m * 5.0) / 2.0
203    }
204
205    // large_estimator_cutoff calculates The cutoff for using the "large range
206    // correction" formula, from the HyperLogLog algorithm, adapted for 64 bit
207    // hashes.  See http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll.
208    pub(crate) fn large_estimator_cutoff(two_to_l: f64) -> f64 {
209        two_to_l / 30.0
210    }
211
212    // two_to_l calculates 2 raised to L where L is the "large range correction
213    // boundary" described at http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll.
214    pub(crate) fn two_to_l(log_2m: u32, reg_width: u32) -> f64 {
215        let max_register_value = (1 << reg_width) - 1;
216
217        // Since 1 is added to p(w) in the insertion algorithm, only
218        // (max_register_value - 1) bits are inspected hence the hash
219        // space is one power of two smaller.
220        let pw_bits = max_register_value - 1;
221        let total_bits = pw_bits + log_2m;
222
223        // NOTE : this can get larger than fits in a 64 bit integer.
224        2_f64.powf(total_bits.into())
225    }
226
227    pub(crate) fn pack_cutoff_byte(&self) -> u8 {
228        let threshold = if self.explicit_threshold == AUTO_EXPLICIT_THRESHOLD {
229            63
230        } else if self.explicit_threshold == 0 {
231            0
232        } else {
233            u32::BITS - (self.explicit_threshold as u32).leading_zeros() - 1
234        };
235
236        let mut res = threshold;
237        if self.sparse_threshold.is_some() {
238            res |= 1 << 6
239        }
240
241        res as u8
242    }
243
244    /// (sparse_enabled, explicit_threshold)
245    pub(crate) fn unpack_cutoff_byte(b: u8) -> (bool, i32) {
246        let sparse_enabled = b >> 6 == 1;
247        let threshold = b & 0x3F;
248
249        if threshold == 0 {
250            return (sparse_enabled, 0);
251        }
252
253        if threshold == 63 {
254            return (sparse_enabled, -1);
255        }
256
257        (sparse_enabled, 1 << (threshold - 1))
258    }
259}
260
261#[cfg(test)]
262mod test {
263    use super::Settings;
264
265    #[test]
266    fn pw() {
267        let settings = Settings::new(10, 5, 0, false);
268
269        println!("{:?}", settings);
270    }
271
272    #[test]
273    fn left_shift() {
274        assert_eq!(1 << 0, 1);
275    }
276}