bitbelay_tests/correlation/
bitwise.rs

1//! Bitwise correlation test.
2
3use std::collections::HashMap;
4use std::hash::BuildHasher;
5use std::hash::Hasher as _;
6use std::num::NonZeroUsize;
7
8use bitbelay_providers::Provider;
9use bitbelay_report::section;
10use bitbelay_report::section::test;
11use bitbelay_report::section::test::module;
12use bitbelay_statistics::correlation::pearson;
13use colored::Colorize as _;
14use ordered_float::OrderedFloat;
15use tracing::debug;
16use tracing::info;
17
18/// Results from a bitwise correlation test.
19pub type Results = HashMap<(usize, usize), Option<f64>>;
20
21/// A bitwise correlation test.
22#[derive(Debug)]
23pub struct Test<'a, H: BuildHasher, const N: usize> {
24    /// The build hasher.
25    build_hasher: &'a H,
26
27    /// The bit values accumulated for each bit in the output hash.
28    bit_values: [Vec<f64>; N],
29
30    /// The threshold of correlation at which any non-diagonal value causes the
31    /// test to fail.
32    threshold: f64,
33}
34
35impl<'a, H: BuildHasher, const N: usize> Test<'a, H, N> {
36    /// Creates a new [`Test`].
37    ///
38    /// # Examples
39    ///
40    /// ```
41    /// use std::hash::RandomState;
42    ///
43    /// use bitbelay_tests::correlation::bitwise::Test;
44    ///
45    /// let hasher = RandomState::new();
46    /// let test = Test::<RandomState, 64>::new(&hasher, 0.05);
47    /// ```
48    pub fn new(build_hasher: &'a H, threshold: f64) -> Self {
49        Self {
50            build_hasher,
51            bit_values: [(); N].map(|_| Vec::new()),
52            threshold,
53        }
54    }
55
56    /// Gets the bit values of this [`Test`] by reference.
57    ///
58    /// # Examples
59    ///
60    /// ```
61    /// use std::hash::RandomState;
62    /// use std::num::NonZeroUsize;
63    ///
64    /// use bitbelay_providers::ascii::AlphanumericProvider;
65    /// use bitbelay_providers::Provider;
66    /// use bitbelay_tests::correlation::bitwise::Test;
67    ///
68    /// let mut provider: Box<dyn Provider> = Box::new(AlphanumericProvider::new(10));
69    /// let hasher = RandomState::new();
70    /// let mut test = Test::<RandomState, 64>::new(&hasher, 0.05);
71    ///
72    /// test.run(&mut provider, NonZeroUsize::try_from(10).unwrap());
73    ///
74    /// assert_eq!(test.bit_values()[0].len(), 10);
75    /// ```
76    pub fn bit_values(&self) -> &[Vec<f64>; N] {
77        &self.bit_values
78    }
79
80    /// Gets the threshold of this [`Test`] by reference.
81    ///
82    /// # Examples
83    ///
84    /// ```
85    /// use std::hash::RandomState;
86    ///
87    /// use bitbelay_tests::correlation::bitwise::Test;
88    ///
89    /// let hasher = RandomState::new();
90    /// let mut test = Test::<RandomState, 64>::new(&hasher, 0.05);
91    ///
92    /// assert_eq!(test.threshold(), 0.05);
93    /// ```
94    pub fn threshold(&self) -> f64 {
95        self.threshold
96    }
97
98    /// Runs a set of iterations using a [`Provider`].
99    ///
100    /// # Examples
101    ///
102    /// ```
103    /// use std::hash::RandomState;
104    /// use std::num::NonZeroUsize;
105    ///
106    /// use bitbelay_providers::ascii::AlphanumericProvider;
107    /// use bitbelay_providers::numeric::Unsigned64BitProvider;
108    /// use bitbelay_providers::Provider;
109    /// use bitbelay_tests::correlation::bitwise::Test;
110    ///
111    /// let mut alphas: Box<dyn Provider> = Box::new(AlphanumericProvider::new(10));
112    /// let mut numbers: Box<dyn Provider> = Box::new(Unsigned64BitProvider::new(10));
113    ///
114    /// let hasher = RandomState::new();
115    /// let mut test = Test::<RandomState, 64>::new(&hasher, 0.05);
116    ///
117    /// test.run(&mut alphas, NonZeroUsize::try_from(10).unwrap());
118    /// test.run(&mut numbers, NonZeroUsize::try_from(10).unwrap());
119    ///
120    /// assert_eq!(test.bit_values()[0].len(), 20);
121    /// ```
122    pub fn run(&mut self, provider: &mut Box<dyn Provider>, iterations: NonZeroUsize) {
123        let hashes = compute_hashes(self.build_hasher, provider, iterations);
124        let newly_computed_bit_values = extract_bit_values_from_hashes::<u64, N>(&hashes);
125
126        for (i, values) in newly_computed_bit_values.iter().enumerate() {
127            // SAFETY: the length of the `newly_computed_bit_values` array is statically
128            // guarenteed to be `N`, which is the same as the size of `self.bit_values`.
129            // As such, this indexing will always succeed.
130            self.bit_values[i].extend(values);
131        }
132    }
133
134    /// Gets the [`Results`] of a [`Test`] using the [`Test`]'s current interal
135    /// state.
136    ///
137    /// # Examples
138    ///
139    /// ```
140    /// use std::hash::RandomState;
141    /// use std::num::NonZeroUsize;
142    ///
143    /// use bitbelay_providers::ascii::AlphanumericProvider;
144    /// use bitbelay_providers::Provider;
145    /// use bitbelay_tests::correlation::bitwise::Test;
146    ///
147    /// let mut provider: Box<dyn Provider> = Box::new(AlphanumericProvider::new(10));
148    /// let hasher = RandomState::new();
149    /// let mut test = Test::<RandomState, 64>::new(&hasher, 0.05);
150    ///
151    /// test.run(&mut provider, NonZeroUsize::try_from(10).unwrap());
152    ///
153    /// let results = test.results().unwrap();
154    /// // Do something with the results.
155    /// ```
156    pub fn results(&self) -> Option<Results> {
157        info!("Computing Pearson correlations for each bit-bit mapping.");
158
159        // SAFETY: there should always be at least one output bit, so this should always
160        // unwrap.
161        if self.bit_values.first().unwrap().is_empty() {
162            return None;
163        }
164
165        let mut results = HashMap::new();
166
167        for i in 0..N {
168            for j in 0..N {
169                // SAFETY: we checked above that there was at least test iteration run. As
170                // such, this should always unwrap.
171                let correlation = pearson::correlation(&self.bit_values[i], &self.bit_values[j]);
172                results.insert((i, j), correlation);
173            }
174
175            debug!("Computed all bit-bit correlations for bit index {}.", i);
176        }
177
178        Some(results)
179    }
180}
181
182impl<'a, H: BuildHasher, const N: usize> crate::r#trait::Test for Test<'a, H, N> {
183    fn title(&self) -> &'static str {
184        "Bitwise Correlation"
185    }
186
187    fn report_section(&self) -> bitbelay_report::section::Test {
188        // SAFETY: there should always be at least one output bit, so this should always
189        // unwrap.
190        if self.bit_values.first().unwrap().is_empty() {
191            panic!("a report can only be generated when at least one test has been run!");
192        }
193
194        let mut correlations = self
195            .results()
196            // SAFETY: we checked above that there was at least test iteration run. As
197            // such, this should always unwrap.
198            .unwrap()
199            .into_iter()
200            .filter(|((i, j), _)| i != j)
201            .map(|(pos, corr)| {
202                (
203                    pos,
204                    OrderedFloat(match corr {
205                        Some(corr) => corr.abs(),
206                        None => panic!(
207                            "one of the correlations was [`None`], which means that there was \
208                             some issue computing the correlation metrics. This is especially \
209                             possible when there were not many iterations run: for example, if \
210                             every bit in one of the arrays was a 0. We recommend trying the test \
211                             again with more samples. If this continue to happen, please file an \
212                             issue."
213                        ),
214                    }),
215                )
216            })
217            .collect::<Vec<_>>();
218        correlations.sort_by(|(_, a), (_, b)| b.cmp(a));
219
220        let (result, mut details) = if correlations
221            .iter()
222            .any(|(_, correlation)| correlation.into_inner() >= self.threshold)
223        {
224            (
225                module::Result::Fail,
226                String::from(
227                    "One or more non-diagonals had a correlation greater than or equal to the \
228                     threshold.\n\n",
229                ),
230            )
231        } else {
232            (
233                module::Result::Pass,
234                String::from("All non-diagonals had a correlation lower than the threshold.\n\n"),
235            )
236        };
237
238        details.push_str("Maxmium correlation values:\n");
239
240        correlations
241            .iter()
242            .take(10)
243            .map(|((i, j), correlation)| format!("\n  * ({}, {}) => {:.4}", i, j, correlation))
244            .for_each(|s| details.push_str(&s));
245
246        let module =
247            module::Module::new(result, "Pearson correlation threshold", None, Some(details));
248        get_report_base().push_module(module).try_build().unwrap()
249    }
250}
251
252/// Computes `iterations` number of hashes using the hasher provided in
253/// `build_hasher` and the data provided by `provider`.
254fn compute_hashes<H: BuildHasher>(
255    build_hasher: &H,
256    provider: &mut Box<dyn Provider>,
257    iterations: NonZeroUsize,
258) -> Vec<u64> {
259    info!("Computing {} hashes.", iterations);
260
261    provider
262        .provide(iterations.get())
263        .into_iter()
264        .enumerate()
265        .map(|(i, input)| {
266            if i % 1_000 == 0 && i > 0 {
267                debug!("Computed {} hashes.", i);
268            }
269
270            let mut hasher = build_hasher.build_hasher();
271            hasher.write(input);
272            hasher.finish()
273        })
274        .collect()
275}
276
277/// Extracts each bit within a set of hashes to a [`Vec`] of their own.
278///
279/// For example, bit 0 from every hash is pulled into the first [`Vec<f64>`]
280/// returned, bit 1 from every hash is pulled into the second [`Vec<f64>`]
281/// return, etc.
282fn extract_bit_values_from_hashes<T, const N: usize>(hashes: &[T]) -> Vec<Vec<f64>>
283where
284    T: Copy
285        + std::ops::Shr<usize, Output = T>
286        + std::ops::BitAnd<Output = T>
287        + From<u64>
288        + Into<u64>,
289{
290    info!(
291        "Extracing bit values across {} {}-bit hashes.",
292        hashes.len(),
293        N
294    );
295
296    let mut bit_values: Vec<Vec<f64>> = vec![Vec::new(); N];
297
298    for hash in hashes {
299        for (i, bit_value) in bit_values.iter_mut().enumerate() {
300            let as_byte = ((*hash >> i) & T::from(1_u64)).into();
301            bit_value.push(as_byte as f64);
302        }
303    }
304
305    bit_values
306}
307
308/// Populates the boilerplate report information within a
309/// [`Test`](section::Test).
310pub fn get_report_base() -> section::test::Builder {
311    let overview = "The bitwise correlation test assesses the correlation between each pair of \
312                    bits, known as 'bit-bit comparisons', for a set of output hashes using the \
313                    Pearson correlation coefficient.";
314
315    let algorithm =
316        "For a specified hash function, a provider and number of iterations is specified:\n\n(1) \
317         A random input is generated from the provider and the output hash is computed. This \
318         happens for the number of iterations specified, and the results are accumulated in an \
319         array. You can think of this as, roughly, a matrix of bits where each row is an output \
320         hash and each column is the bit at position _i_ of the output hash.\n\n(2) This matrix \
321         of bits is effectively transposed, meaning that an array is created for each bit \
322         position, with each array containing the values of that bit across all hashes. These \
323         arrays are referred to as 'bit values' for their respective bit positions.\n\n(3) For \
324         each pair of bit positions, the Pearson correlation is calculated between their \
325         corresponding arrays of bit values. This measures the level of correlation between every \
326         pair of output bits. Note that, though Pearson correlation is symmetric (meaning the \
327         correlation of (i, j) is the same as the correlation of (j, i)), all pairwise \
328         comparisons are computed.\n\n(4) The resulting correlations are stored in a HashMap, \
329         with each key being a tuple `(i, j)` representing the Pearson correlation coefficient \
330         between the bit values at position _i_ and the bit values at position _j_.";
331
332    let interpretation = "* A 'good' result is one where bits within the output hashes are not \
333                          highly correlated with one another. This indicates that the bits are \
334                          largely independent under the test.\n\n * There is one exception, \
335                          called the 'diagonal' of the correlation matrix. The rationale behind \
336                          this is straightforward: when an array of bit values at position _i_ is \
337                          compared against itself, the arrays are identical, and the correlation \
338                          should be 1.0. This presence of this phenomenon is often used as a \
339                          check to ensure that results are being calculated as expected.";
340
341    let sources = "* https://en.wikipedia.org/wiki/Pearson_correlation_coefficient";
342
343    test::Builder::default()
344        .title("Bitwise Pearson Correlation".to_string())
345        .unwrap()
346        .description(format!(
347            "{}\n\n{}\n\n{}\n\n{}\n\n{}\n\n{}\n\n{}\n\n{}",
348            "Overview".italic(),
349            overview,
350            "Algorithm".italic(),
351            algorithm,
352            "Interpretation".italic(),
353            interpretation,
354            "Sources".italic(),
355            sources
356        ))
357        .unwrap()
358}
359
360#[cfg(test)]
361mod tests {
362    use crate::correlation::bitwise::extract_bit_values_from_hashes;
363
364    #[test]
365    fn it_extracts_bit_values_from_hashes_correctly() {
366        let hashes: [u64; 4] = [0x00, 0x01, 0x02, 0x03];
367        let bit_values = extract_bit_values_from_hashes::<u64, 64>(&hashes[..]);
368
369        #[allow(clippy::needless_range_loop)]
370        for i in 2..64 {
371            assert_eq!(bit_values[i], &[0., 0., 0., 0.]);
372        }
373
374        assert_eq!(bit_values[1], &[0., 0., 1., 1.]);
375        assert_eq!(bit_values[0], &[0., 1., 0., 1.]);
376    }
377}