probabilistic_data_structures 0.1.0

Probabilistic data structures in Rust Lang.
Documentation
use log::debug;
use std::{
    collections::hash_map::DefaultHasher,
    hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
};

type HasherType = DefaultHasher;

#[derive(Default, Debug, Clone, Hash)]
pub struct BitMaps(pub Vec<u64>);

// #[derive(Default, Debug, Clone)]
// #[derive(Debug)]
pub struct PCSA {
    pub b:            u32,
    pub m:            u64,
    pub bitmaps_size: u64,
    pub bitmaps:      Vec<u64>,
    pub hasher:       Box<dyn BuildHasher<Hasher = HasherType>>,
}

/// PCSA -
/// * [Sketch of the Day: Probabilistic Counting with Stochastic Averaging (PCSA)](https://research.neustar.biz/2013/04/02/sketch-of-the-day-probabilistic-counting-with-stochastic-averaging-pcsa/)
/// * [Sketch of the Day: Probabilistic Counting with Stochastic Averaging (PCSA)](https://agkn.wordpress.com/2013/04/02/sketch-of-the-day-probabilistic-counting-with-stochastic-averaging-pcsa/)
/// * [Flajolet–Martin algorithm](https://en.wikipedia.org/wiki/Flajolet%E2%80%93Martin_algorithm)
/// * [Probabilistic Counting Algorithms for Data Base Applications](http://algo.inria.fr/flajolet/Publications/src/FlMa85.pdf)
impl PCSA {
    pub fn new() -> Self {
        // let m = 32;
        // m = 2^b # with b in [4...16]
        let b = 5;
        let m = 2u64.pow(b); // 32
        let bitmaps_size = m;
        let bitmaps = [0].repeat(bitmaps_size as usize);

        type Builder = BuildHasherDefault<HasherType>;
        let hasher = Builder::default();
        PCSA {
            b,
            m,
            bitmaps_size,
            bitmaps,
            hasher: Box::new(hasher),
        }
    }

    pub fn add(&mut self, record: String) -> std::io::Result<()> {
        // https://doc.rust-lang.org/std/primitive.u64.html
        // https://stackoverflow.com/questions/50458144/what-is-the-easiest-way-to-pad-a-string-with-0-to-the-left
        // use std::ops::{BitOr, BitOrAssign};

        let mut hasher = self.hasher.build_hasher();
        hasher.write(record.as_bytes());
        let hash = hasher.finish();

        print!(
            "PCSA.add:
                record                 ={:?}
                count_ones             ={:?}
                count_zeros            ={:?}
                leading_zeros          ={:?}
                trailing_zeros         ={:?}
                leading_ones           ={:?}
                trailing_ones          ={:?}
                hash                   ={:?}
                hash_binary            ={:?}
            ",
            record,
            hash.count_ones(),
            hash.count_zeros(),
            hash.leading_zeros(),
            hash.trailing_zeros(),
            -1,
            -1,
            hash,
            format!("{:064b}", hash),
        );

        let hash_bytes = hash.to_be_bytes();
        let leading_zeros = hash.leading_zeros();
        let leading_zeros_bytes = leading_zeros.to_be_bytes();

        let low_m_bits = hash & ((1 << self.b) - 1);
        let hi = 64 - self.b;
        let lo = self.b;
        let top = (hash >> lo) & ((1 << hi) - 1);
        let bottom = hash & ((1 << lo) - 1);

        // (x >> 3) & ((1 << 5)-1); // first 5 bits
        // (x >> 0) & ((1 << 3)-1) // lower 3 bits

        let bitmap_index = bottom as usize;
        let bitmap_old = self.bitmaps[bitmap_index].clone();

        let leading_zeros_new = top.leading_zeros();
        let mask = 1 << leading_zeros_new;
        let bitmap_new = bitmap_old | mask;
        let run_length = 0;

        print!(
            "
                run_length             ={:?}
                hash_bytes             ={:?}
                low_m_bits             ={:?}
                low_m_bits_binary      ={:?}
                top                    ={:?}
                top_binary             ={:?}
                bottom                 ={:?}
                bottom_binary          ={:?}
                leading_zeros_new      ={:?}
                leading_zeros          ={:?}
                leading_zeros_bytes    ={:?}
                bitmap_index           ={:?}
                leading_zeros_binary   ={:?}
                mask                   ={:?}
                mask_binary            ={:?}
                bitmap_old             ={:?}
                bitmap_value_old_binary={:?}
                bitmap_new             ={:?}
                bitmap_value_new_binary={:?}
            ",
            run_length,
            hash_bytes,
            low_m_bits,
            format!("{:064b}", low_m_bits),
            top,
            format!("{:064b}", top),
            bottom,
            format!("{:064b}", bottom),
            leading_zeros_new,
            leading_zeros,
            leading_zeros_bytes,
            bitmap_index,
            format!("{:064b}", leading_zeros),
            mask,
            format!("{:064b}", mask),
            bitmap_old,
            format!("{:064b}", bitmap_old),
            bitmap_new,
            format!("{:064b}", bitmap_new),
        );

        // set the bitmap bit based on the run length observed
        self.bitmaps[bitmap_index] = bitmap_new;

        Ok(())
    }

    pub fn cardinality(&self) -> std::io::Result<u64> {
        // Determine the cardinality
        // phi = 0.77351
        // DV = m / phi * 2 ^ (sum( least_sig_bit( bitmap ) ) / m) # the DV estimate
        let result = 0;
        debug!("PCSA.cardinality: result={:?}", result);
        Ok(result)
    }
}

// impl std::default::Default for PCSA {
//     fn default() -> Self {
//         PCSA {
//             bitmaps_size: 32,
//             bitmaps: Vec::new(),
//             bitmaps_test_1: BitMaps(Vec::new()),
//             hasher: Box::new(DefaultHasher::new()),
//         }
//     }
// }

impl std::fmt::Debug for PCSA {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let bitmaps_as_str = self
            .bitmaps
            .iter()
            .map(|bitmap_value| format!("{:064b}", bitmap_value))
            .collect::<Vec<_>>()
            .join(", ");
        write!(
            f,
            "[
              b={:?}
              m={:?}
              bitmaps_size={:?}
              bitmaps={:?}
            ]",
            self.b, self.m, self.bitmaps_size, bitmaps_as_str,
        )
    }
}

impl std::fmt::Display for PCSA {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let bitmaps_as_str = self
            .bitmaps
            .iter()
            .map(|bitmap_value| format!("{:064b}", bitmap_value))
            .collect::<Vec<_>>()
            .join(", ");
        write!(
            f,
            "[
              b={:?}
              m={:?}
              bitmaps_size={:?}
              bitmaps={:?}
            ]",
            self.b, self.m, self.bitmaps_size, bitmaps_as_str,
        )
    }
}

#[cfg(test)]
mod tests {
    #[test]
    fn it_works() {
        assert_eq!(2 + 2, 4);
    }
}