quantiles/
misra_gries.rs

1//! Misra-Gries calculates an ε-approximate frequency count for a stream of N
2//! elements. The output is the k most frequent elements.
3//!
4//! 1. the approximate count f'[e] is smaller than the true frequency f[e] of e,
5//!    but by at most εN, i.e., (f[e] - εN) ≤ f'[e] ≤ f[e]
6//! 2. any element e with a frequency f[e] ≥ εN appears in the result set
7//!
8//! The error bound ε = 1/(k+1) where k is the number of counters used in the
9//! algorithm.
10//! When k = 1 i.e. a single counter, the algorithm is equivalent to the
11//! Boyer-Moore Majority algorithm.
12//!
13//! If you want to check for elements that appear at least εN times, you will
14//! want to perform a second pass to calculate the exact frequencies of the
15//! values in the result set which can be done in constant space.
16//!
17//! `@article{MISRA1982143,
18//! title = "Finding repeated elements",
19//! journal = "Science of Computer Programming",
20//! volume = "2",
21//! number = "2",
22//! pages = "143 - 152",
23//! year = "1982",
24//! issn = "0167-6423",
25//! doi = "http://dx.doi.org/10.1016/0167-6423(82)90012-0",
26//! url = "http://www.sciencedirect.com/science/article/pii/0167642382900120",
27//! author = "J. Misra and David Gries",
28//! }`
29//!
30//! # Examples
31//!
32//! ```
33//! use quantiles::misra_gries::*;
34//!
35//! let k: usize = 3;
36//! let numbers: Vec<u32> = vec![1,3,2,1,3,4,3,1,2,1];
37//! let counts = misra_gries(numbers.iter(), k);
38//! let bound = numbers.len() / (k+1);
39//! let in_range = |f_expected: usize, f_approx: usize| {
40//!   f_approx <= f_expected &&
41//!   (bound >= f_expected || f_approx >= (f_expected - bound))
42//! };
43//! assert!(in_range(4usize, *counts.get(&1).unwrap()));
44//! assert!(in_range(2usize, *counts.get(&2).unwrap()));
45//! assert!(in_range(3usize, *counts.get(&3).unwrap()));
46//! ```
47
48use std::collections::BTreeMap;
49use std::collections::btree_map::Entry;
50
51/// Calculates the `k` most frequent elements in the iterable
52/// stream of elements `stream` using an ε-approximate frequency count where ε
53/// = 1/(k+1)
54pub fn misra_gries<I, V>(stream: I, k: usize) -> BTreeMap<V, usize>
55where
56    I: IntoIterator<Item = V>,
57    V: Ord + Clone,
58{
59    let mut counters = BTreeMap::new();
60    for i in stream {
61        let counters_len = counters.len();
62        let mut counted = false;
63
64        match counters.entry(i.clone()) {
65            Entry::Occupied(mut item) => {
66                *item.get_mut() += 1;
67                counted = true;
68            }
69            Entry::Vacant(slot) => if counters_len < k {
70                slot.insert(1);
71                counted = true;
72            },
73        }
74
75        if !counted {
76            for c in counters.values_mut() {
77                *c -= 1;
78            }
79
80            counters = counters.into_iter().filter(|&(_, v)| v != 0).collect();
81        }
82    }
83
84    counters
85}
86
87#[cfg(test)]
88mod test {
89    use super::*;
90    use std::collections::BTreeMap;
91
92    /// Calculate exact element frequencies using O(n) space.
93    pub fn exact_frequencies<I, V>(stream: I) -> BTreeMap<V, usize>
94    where
95        I: IntoIterator<Item = V>,
96        V: Ord + Clone,
97    {
98        let mut counts = BTreeMap::new();
99        for i in stream {
100            *counts.entry(i.clone()).or_insert(0) += 1;
101        }
102        counts
103    }
104
105    #[test]
106    fn test_exact_frequencies() {
107        let numbers = vec![1, 2, 1, 3, 3, 1, 2, 4];
108        let counts = exact_frequencies(numbers.iter());
109        assert_eq!(*counts.get(&1).unwrap() as u32, 3);
110        assert_eq!(*counts.get(&2).unwrap() as u32, 2);
111        assert_eq!(*counts.get(&3).unwrap() as u32, 2);
112        assert_eq!(*counts.get(&4).unwrap() as u32, 1);
113    }
114
115    quickcheck! {
116        fn is_exact(xs: Vec<u32>) -> bool {
117            exact_frequencies(xs.iter()) == misra_gries(xs.iter(), xs.len())
118        }
119
120        fn is_approximate(xs: Vec<u32>) -> bool {
121            //(f[e] − εN) ≤ f'[e] ≤ f[e]
122
123            let exacts = exact_frequencies(xs.iter());
124            let n = xs.len();
125
126            for k in 1..n {
127                let epsilon_n = n / (k+1);
128                let approxes = misra_gries(xs.iter(), k);
129
130                for (i, c) in approxes {
131                    let exact = *exacts.get(i).unwrap();
132
133                    if c > exact {
134                        return false;
135                    }
136
137                    if epsilon_n < exact && c < (exact - epsilon_n) {
138                        return false;
139                    }
140                }
141            }
142
143            true
144        }
145    }
146}