1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
use std::collections::hash_map::{HashMap, Entry};
use std::fmt;
use std::hash::Hash;
use std::iter::{FromIterator, IntoIterator};
use std::default::Default;

use Commute;

/// A commutative data structure for exact frequency counts.
#[derive(Clone)]
pub struct Frequencies<T> {
    data: HashMap<T, u64>,
}

impl<T: fmt::Debug + Eq + Hash> fmt::Debug for Frequencies<T> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{:?}", self.data)
    }
}

impl<T: Eq + Hash> Frequencies<T> {
    /// Create a new frequency table with no samples.
    pub fn new() -> Frequencies<T> {
        Default::default()
    }

    /// Add a sample to the frequency table.
    pub fn add(&mut self, v: T) {
        match self.data.entry(v) {
            Entry::Vacant(count) => { count.insert(1); },
            Entry::Occupied(mut count) => { *count.get_mut() += 1; },
        }
    }

    /// Return the number of occurrences of `v` in the data.
    pub fn count(&self, v: &T) -> u64 {
        self.data.get(v).map(|&v| v).unwrap_or(0)
    }

    /// Return the cardinality (number of unique elements) in the data.
    pub fn cardinality(&self) -> u64 {
        self.len() as u64
    }

    /// Returns the mode if one exists.
    pub fn mode(&self) -> Option<&T> {
        let counts = self.most_frequent();
        if counts.is_empty() {
            None
        } else if counts.len() >= 2 && counts[0].1 == counts[1].1 {
            None
        } else {
            Some(counts[0].0)
        }
    }

    /// Return a `Vec` of elements and their corresponding counts in
    /// descending order.
    pub fn most_frequent(&self) -> Vec<(&T, u64)> {
        let mut counts: Vec<_> = self.data.iter()
                                          .map(|(k, &v)| (k, v))
                                          .collect();
        counts.sort_by(|&(_, c1), &(_, c2)| c2.cmp(&c1));
        counts
    }

    /// Return a `Vec` of elements and their corresponding counts in
    /// ascending order.
    pub fn least_frequent(&self) -> Vec<(&T, u64)> {
        let mut counts: Vec<_> = self.data.iter()
                                          .map(|(k, &v)| (k, v))
                                          .collect();
        counts.sort_by(|&(_, c1), &(_, c2)| c1.cmp(&c2));
        counts
    }

    /// Returns the cardinality of the data.
    pub fn len(&self) -> usize {
        self.data.len()
    }
}

impl<T: Eq + Hash> Commute for Frequencies<T> {
    fn merge(&mut self, v: Frequencies<T>) {
        for (k, v2) in v.data.into_iter() {
            match self.data.entry(k) {
                Entry::Vacant(v1) => { v1.insert(v2); }
                Entry::Occupied(mut v1) => { *v1.get_mut() += v2; }
            }
        }
    }
}

impl<T: Eq + Hash> Default for Frequencies<T> {
    fn default() -> Frequencies<T> {
        Frequencies { data: HashMap::with_capacity(100000) }
    }
}

impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
    fn from_iter<I: IntoIterator<Item=T>>(it: I) -> Frequencies<T> {
        let mut v = Frequencies::new();
        v.extend(it);
        v
    }
}

impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
    fn extend<I: IntoIterator<Item=T>>(&mut self, it: I) {
        for sample in it {
            self.add(sample);
        }
    }
}

#[cfg(test)]
mod test {
    use super::Frequencies;

    #[test]
    fn ranked() {
        let mut counts = Frequencies::new();
        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4].into_iter());
        assert_eq!(counts.most_frequent()[0], (&2, 5));
        assert_eq!(counts.least_frequent()[0], (&3, 1));
    }
}