c2_histograms 0.2.4

Tools for histogram compression and usage
Documentation
use std::collections::HashMap;
use crate::c2::{CompressedParams, CompressedHistogram, CompactHistogram, CompactParams, C2Params, C2Histogram};

// -----------------------------------------------------------------------------------------
// --- GENERAL TRAITS ----------------------------------------------------------------------
// -----------------------------------------------------------------------------------------

/// For many of the histogram space optimizations, a set of parameters are needed to
/// interpret the data in memory as a histogram. These parameters are often needed to create
/// a more complex kind of histogram, and they always must be passed in the histogram's
/// [`labels`] and [`frequency`] functions.
///
/// [`labels`]: trait.Histogram.html#tymethod.labels
/// [`frequency`]: trait.Histogram.html#tymethod.frequency
pub trait HistogramParams {}

/// Specifies the behaviors that are necessary to be a histogram. It is generic
/// and parameterized by `P`, the type of parameters necessary, `L`, the type
/// of the labels, and `F`, the type that stores the frequency.  Both the labels
/// and the frequencies need to be some sort of number.
pub trait Histogram<P, L, F> where P: HistogramParams {
    /// If possible, creates and returns an iterator for the labels for which
    /// the histogram has frequencies stored.
    fn labels(&self, params : &P) -> Option<Box<dyn Iterator<Item=L>>>;

    /// If possible, returns the frequency of a particular label.
    /// The behavior need only be defined if the label is in the
    /// set of values returned from labels.
    fn frequency(&self, label : L, params : &P) -> Option<F>;

    /// the total number of data-points in the histogram
    fn total(&self) -> F;
}

// -----------------------------------------------------------------------------------------
// --- TESTING FOR STANDARD HISTOGRAM ------------------------------------------------------
// -----------------------------------------------------------------------------------------

#[cfg(test)]
mod standard_testing {
    // test the general behavior as well as compacting them
    // TODO set up tests for compression as well, and c2

    use std::collections::HashMap;
    use crate::standard::{create_standard_histogram, NullParams, StandardHistogram, Histogram};
    use crate::c2::{new_compressed_params, new_compact_params, new_c2_params};

    #[test]
    fn init() {
        let mut d = HashMap::new();
        d.insert(1, 40);
        d.insert(3, 15);
        d.insert(50, 1);

        let hist = create_standard_histogram(d);
        let np = NullParams {};

        assert_eq!(hist.total(), 56);
        assert_eq!(hist.frequency(1, &np), Some(40));
        assert_eq!(hist.frequency(3, &np), Some(15));
        assert_eq!(hist.frequency(50, &np), Some(1));
        assert_eq!(hist.frequency(4, &np), None);
        for i in hist.labels(&np).unwrap(){
            assert!(i == 1 || i == 3 || i == 50)
        }
    }

    fn get_sample_n_hist() -> StandardHistogram {
        let mut d = HashMap::new();
        d.insert(1, 40);
        d.insert(3, 15);
        d.insert(50, 1);

        create_standard_histogram(d)
    }

    #[test]
    fn compaction1(){
        let cp = new_compact_params(5.0, 6, 7).unwrap();
        let hist = get_sample_n_hist().to_compact(&cp).unwrap();

        assert_eq!(hist.total(), 56);
        assert_eq!(hist.frequency(1, &cp), Some(55));
        assert_eq!(hist.frequency(3, &cp), Some(55));
        assert_eq!(hist.frequency(50, &cp), Some(1));
        assert_eq!(hist.frequency(4, &cp), Some(55));
        assert_eq!(hist.frequency(126, &cp), Some(0));

    }

    #[test]
    fn compaction2(){
        let cp = new_compact_params(2.0, 9, 7).unwrap();
        let hist = get_sample_n_hist().to_compact(&cp).unwrap();

        assert_eq!(hist.total(), 56);
        assert_eq!(hist.frequency(1, &cp), Some(40));
        assert_eq!(hist.frequency(3, &cp), Some(15));
        assert_eq!(hist.frequency(50, &cp), Some(1));
        assert_eq!(hist.frequency(126, &cp), Some(0));

    }

    #[test]
    fn compression1() {
        let cp = new_compressed_params(2.0, 9).unwrap();
        let hist = get_sample_n_hist().to_compressed(&cp).unwrap();

        assert_eq!(hist.total(), 82);
        assert_eq!(hist.frequency(1, &cp), Some(64));
        assert_eq!(hist.frequency(3, &cp), Some(16));
        assert_eq!(hist.frequency(50, &cp), Some(2));
        assert!(hist.frequency(126, &cp).is_none());
    }

    #[test]
    fn compression2() {
        let cp = new_compressed_params(1.5, 30).unwrap();
        let hist = get_sample_n_hist().to_compressed(&cp).unwrap();

        assert_eq!(hist.total(), 75);
        assert_eq!(hist.frequency(1, &cp), Some(57));
        assert_eq!(hist.frequency(3, &cp), Some(17));
        assert_eq!(hist.frequency(50, &cp), Some(1));
        assert!(hist.frequency(126, &cp).is_none());
    }

    #[test]
    fn c2_1(){
        let cp = new_compressed_params(1.3, 63).unwrap();
        let cp2 = new_compact_params(14.0, 8, 15).unwrap();
        let cp = new_c2_params(cp2, cp);

        let hist = get_sample_n_hist().to_c2(&cp).unwrap();

        assert_eq!(hist.total(), 67);
        assert_eq!(hist.frequency(1, &cp), Some(66));
        assert_eq!(hist.frequency(3, &cp), Some(66));
        assert_eq!(hist.frequency(50, &cp), Some(1));

    }

}

// -----------------------------------------------------------------------------------------
// --- DEFINE STANDARD HISTOGRAM -----------------------------------------------------------
// -----------------------------------------------------------------------------------------

/// These are empty parameters, standard histograms don't need any extra information.
///
/// # Example
/// Basic Initialization is sufficient
/// ```
/// # use c2_histograms::standard::NullParams;
/// let np = NullParams {};
/// ```
pub struct NullParams {}

impl HistogramParams for NullParams {}

/// `StandardHistogram` is a naive implementation of a histogram with no optimizations.
/// It has optimal performance since it stores an exact count for exact labels, but it can take
/// an arbitrary amount of space. It uses a library HashMap to store label-frequency pairs, and
/// can be instantiated with one using the [`create_standard_histogram`] function.
/// `StandardHistogram` is associated with NullParams.
///
/// # Example
/// To create, pass a HashMap to the constructor function:
/// ```
/// # use c2_histograms::standard::{StandardHistogram, create_standard_histogram};
/// # use std::collections::HashMap;
/// let mut data = HashMap::new();
/// data.insert(1, 40);
/// data.insert(3, 15);
/// data.insert(50, 1);
///
/// let histogram :StandardHistogram = create_standard_histogram(data);
/// ```
///
/// [`create_standard_histogram`]: fn.create_standard_histogram.html
pub struct StandardHistogram {
    // maps from reuse interval to frequency
    data : HashMap<usize, usize>,
    total : usize
}

impl StandardHistogram {
    pub(crate) fn data(&self) -> &HashMap<usize, usize> { &self.data }
}

/// Constructs a `StandardHistogram` from the given HashMap
pub fn create_standard_histogram(data : HashMap<usize, usize>) -> StandardHistogram {
    let total : usize = data.values().sum();
    StandardHistogram { data, total }
}

impl Histogram<NullParams, usize, usize> for StandardHistogram {
    fn labels(&self, _params: &NullParams) -> Option<Box<dyn Iterator<Item=usize>>> {
        let mut out : Vec<usize> = vec![];
        let mut counter : usize = 0;
        for k in self.data.keys(){
            out.insert(counter, *k);
            counter += 1;
        }
        Some(Box::new(out.into_iter()))
    }

    fn frequency(&self, label: usize, _params: &NullParams) -> Option<usize> {
        if let Some(t) = self.data.get(&label) {
            Some(*t)
        }
        else{
            None
        }
    }

    fn total(&self) -> usize {
        self.total
    }
}

// ----------------------------------------------------------------------------------
// --- TRANSFORMATION FUNCTIONS -----------------------------------------------------
// ----------------------------------------------------------------------------------

impl StandardHistogram {
    /// If possible, creates a `CompactHistogram` of this histogram.
    ///
    /// # Example
    /// ```
    /// # use c2_histograms::standard::{StandardHistogram, create_standard_histogram};
    /// # use std::collections::HashMap;
    /// use c2_histograms::c2::{CompactHistogram, new_compact_params, CompactParams};
    /// # let mut data = HashMap::new();
    /// # data.insert(1, 40);
    /// # data.insert(3, 15);
    /// # data.insert(50, 1);
    ///
    /// let histogram : StandardHistogram = create_standard_histogram(data);
    /// let cp : CompactParams = new_compact_params(3.5, 8, 8).unwrap();
    ///
    /// let compact_histogram : CompactHistogram = histogram.to_compact(&cp).unwrap();
    /// ```
    ///
    pub fn to_compact(&self, cp : &CompactParams) -> Option<CompactHistogram> {
        let mut overflow = 0;
        let nb = cp.num_buckets();
        let mut counters :Vec<usize> = vec![];
        let mut sums :Vec<usize> = vec![];
        let mut sub_range_averages :Vec<usize> = vec![];
        for i in 0..nb {
            counters.insert(i, 0);
            sums.insert(i, 0);
            sub_range_averages.insert(i, 0);
        }

        for lab in self.labels(&NullParams {} ).unwrap() {
            if let Some(bi) = cp.bucket_index(lab) {
                counters[bi] += *self.data().get(&lab).unwrap();
                let sum : usize = self.data().values().sum();
                sums[bi] += sum;
            } else {
                overflow += *self.data().get(&lab).unwrap();
            }
        }

        for i in 0..nb {
            let (min, max) = cp.range(i).unwrap();
            let avg = (sums[i] as f64) / (counters[i] as f64);

            let range = (max - min) as f64;
            // if k is greater than range, do the additive thing
            if cp.k() as f64 > range {
                sub_range_averages.insert(i, ((avg - min as f64) / range).floor() as usize);
            } else { // otherwise do proportional thing
                sub_range_averages.insert(i, (((avg - min as f64) / range) * cp.k() as f64).floor() as usize);
            }
        }
        if counters.iter().min() == counters.iter().max() {
            None
        } else {
            Some(CompactHistogram { counters, sub_range_averages, overflowed : overflow  } )
        }
    }

    /// If possible, creates a `CompressedHistogram` of this histogram.
    ///
    /// # Example
    /// ```
    /// # use c2_histograms::standard::{StandardHistogram, create_standard_histogram};
    /// # use std::collections::HashMap;
    /// use c2_histograms::c2::{CompressedHistogram, new_compressed_params, CompressedParams};
    /// # let mut data = HashMap::new();
    /// # data.insert(1, 40);
    /// # data.insert(3, 15);
    /// # data.insert(50, 1);
    ///
    /// let histogram : StandardHistogram = create_standard_histogram(data);
    /// let cp : CompressedParams = new_compressed_params(1.5, 255).unwrap();
    ///
    /// let compact_histogram : CompressedHistogram = histogram.to_compressed(&cp).unwrap();
    /// ```
    ///
    pub fn to_compressed(&self, cp : &CompressedParams) -> Option<CompressedHistogram> {
        // we just need to transform the data into compressed form
        let mut outdata = HashMap::new();
        let mut tot = 0;
        let mut sum = 0;

        for k in self.data.keys() {
            let count = *self.data.get(k).unwrap();
            let comp = cp.compress_count(count).unwrap();
            outdata.insert(*k, comp);
            sum += cp.decompress_count(comp).unwrap();
            tot += count;
        }

        Some(CompressedHistogram { data: outdata, total: sum, orig_total: tot } )
    }

    /// If possible, creates a `C2Histogram` of this histogram
    /// # Example
    /// ```
    /// # use c2_histograms::standard::{StandardHistogram, create_standard_histogram};
    /// # use std::collections::HashMap;
    /// use c2_histograms::c2::{CompactHistogram, new_compact_params, CompactParams, new_c2_params, C2Params, C2Histogram};
    /// use c2_histograms::c2::{CompressedHistogram, new_compressed_params, CompressedParams};
    /// # let mut data = HashMap::new();
    /// # data.insert(1, 40);
    /// # data.insert(3, 15);
    /// # data.insert(50, 1);
    ///
    /// let histogram : StandardHistogram = create_standard_histogram(data);
    ///
    /// let cp1 : CompactParams = new_compact_params(3.5, 8, 8).unwrap();
    /// let cp2 : CompressedParams = new_compressed_params(1.5, 255).unwrap();
    ///
    /// let cp : C2Params = new_c2_params(cp1, cp2);
    ///
    /// let compact_histogram : C2Histogram = histogram.to_c2(&cp).unwrap();
    /// ```
    ///
    pub fn to_c2(&self, c2p : &C2Params) -> Option<C2Histogram> {
        let pressed_ps = c2p.compressed_ps();
        let pact_ps = c2p.compact_ps();
        if let Some(compact) = self.to_compact(&pact_ps) {
            let orig_total = compact.total() - compact.overflowed;
            let mut n_total = 0;
            let mut counters = vec![];

            let mut b = 0;
            for lab in compact.labels(&pact_ps).unwrap() {
                let comp_count = compact.frequency(lab, &pact_ps).unwrap();
                if comp_count == 0 {
                    counters.insert(b, comp_count as u16);
                    b += 1;
                    continue;
                }
                let comp_count = pressed_ps.compress_count(comp_count).unwrap();
                counters.insert(b, comp_count);
                n_total += pressed_ps.decompress_count(comp_count).unwrap();
                b += 1;
            }

            Some(C2Histogram { counters, sub_range_averages: compact.sub_range_averages, orig_total,
                                total : n_total,
                                overflowed : compact.overflowed})
        } else {
            None
        }
    }
}