quantiles/misra_gries.rs
1//! Misra-Gries calculates an ε-approximate frequency count for a stream of N
2//! elements. The output is the k most frequent elements.
3//!
4//! 1. the approximate count f'[e] is smaller than the true frequency f[e] of e,
5//! but by at most εN, i.e., (f[e] - εN) ≤ f'[e] ≤ f[e]
6//! 2. any element e with a frequency f[e] ≥ εN appears in the result set
7//!
8//! The error bound ε = 1/(k+1) where k is the number of counters used in the
9//! algorithm.
10//! When k = 1 i.e. a single counter, the algorithm is equivalent to the
11//! Boyer-Moore Majority algorithm.
12//!
13//! If you want to check for elements that appear at least εN times, you will
14//! want to perform a second pass to calculate the exact frequencies of the
15//! values in the result set which can be done in constant space.
16//!
17//! `@article{MISRA1982143,
18//! title = "Finding repeated elements",
19//! journal = "Science of Computer Programming",
20//! volume = "2",
21//! number = "2",
22//! pages = "143 - 152",
23//! year = "1982",
24//! issn = "0167-6423",
25//! doi = "http://dx.doi.org/10.1016/0167-6423(82)90012-0",
26//! url = "http://www.sciencedirect.com/science/article/pii/0167642382900120",
27//! author = "J. Misra and David Gries",
28//! }`
29//!
30//! # Examples
31//!
32//! ```
33//! use quantiles::misra_gries::*;
34//!
35//! let k: usize = 3;
36//! let numbers: Vec<u32> = vec![1,3,2,1,3,4,3,1,2,1];
37//! let counts = misra_gries(numbers.iter(), k);
38//! let bound = numbers.len() / (k+1);
39//! let in_range = |f_expected: usize, f_approx: usize| {
40//! f_approx <= f_expected &&
41//! (bound >= f_expected || f_approx >= (f_expected - bound))
42//! };
43//! assert!(in_range(4usize, *counts.get(&1).unwrap()));
44//! assert!(in_range(2usize, *counts.get(&2).unwrap()));
45//! assert!(in_range(3usize, *counts.get(&3).unwrap()));
46//! ```
47
48use std::collections::BTreeMap;
49use std::collections::btree_map::Entry;
50
51/// Calculates the `k` most frequent elements in the iterable
52/// stream of elements `stream` using an ε-approximate frequency count where ε
53/// = 1/(k+1)
54pub fn misra_gries<I, V>(stream: I, k: usize) -> BTreeMap<V, usize>
55where
56 I: IntoIterator<Item = V>,
57 V: Ord + Clone,
58{
59 let mut counters = BTreeMap::new();
60 for i in stream {
61 let counters_len = counters.len();
62 let mut counted = false;
63
64 match counters.entry(i.clone()) {
65 Entry::Occupied(mut item) => {
66 *item.get_mut() += 1;
67 counted = true;
68 }
69 Entry::Vacant(slot) => if counters_len < k {
70 slot.insert(1);
71 counted = true;
72 },
73 }
74
75 if !counted {
76 for c in counters.values_mut() {
77 *c -= 1;
78 }
79
80 counters = counters.into_iter().filter(|&(_, v)| v != 0).collect();
81 }
82 }
83
84 counters
85}
86
87#[cfg(test)]
88mod test {
89 use super::*;
90 use std::collections::BTreeMap;
91
92 /// Calculate exact element frequencies using O(n) space.
93 pub fn exact_frequencies<I, V>(stream: I) -> BTreeMap<V, usize>
94 where
95 I: IntoIterator<Item = V>,
96 V: Ord + Clone,
97 {
98 let mut counts = BTreeMap::new();
99 for i in stream {
100 *counts.entry(i.clone()).or_insert(0) += 1;
101 }
102 counts
103 }
104
105 #[test]
106 fn test_exact_frequencies() {
107 let numbers = vec![1, 2, 1, 3, 3, 1, 2, 4];
108 let counts = exact_frequencies(numbers.iter());
109 assert_eq!(*counts.get(&1).unwrap() as u32, 3);
110 assert_eq!(*counts.get(&2).unwrap() as u32, 2);
111 assert_eq!(*counts.get(&3).unwrap() as u32, 2);
112 assert_eq!(*counts.get(&4).unwrap() as u32, 1);
113 }
114
115 quickcheck! {
116 fn is_exact(xs: Vec<u32>) -> bool {
117 exact_frequencies(xs.iter()) == misra_gries(xs.iter(), xs.len())
118 }
119
120 fn is_approximate(xs: Vec<u32>) -> bool {
121 //(f[e] − εN) ≤ f'[e] ≤ f[e]
122
123 let exacts = exact_frequencies(xs.iter());
124 let n = xs.len();
125
126 for k in 1..n {
127 let epsilon_n = n / (k+1);
128 let approxes = misra_gries(xs.iter(), k);
129
130 for (i, c) in approxes {
131 let exact = *exacts.get(i).unwrap();
132
133 if c > exact {
134 return false;
135 }
136
137 if epsilon_n < exact && c < (exact - epsilon_n) {
138 return false;
139 }
140 }
141 }
142
143 true
144 }
145 }
146}