1use foldhash::{HashMap, HashMapExt};
2use std::collections::hash_map::{Entry, Keys};
3use std::hash::Hash;
4
5use rayon::prelude::*;
6
7use crate::Commute;
8#[derive(Clone)]
10pub struct Frequencies<T> {
11 data: HashMap<T, u64>,
12}
13
14#[cfg(debug_assertions)]
15impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
16 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
17 write!(f, "{:?}", self.data)
18 }
19}
20
21impl<T: Eq + Hash> Frequencies<T> {
22 #[must_use]
24 pub fn new() -> Frequencies<T> {
25 Default::default()
26 }
27
28 #[must_use]
30 pub fn with_capacity(capacity: usize) -> Self {
31 Frequencies {
32 data: HashMap::with_capacity(capacity),
33 }
34 }
35
36 #[inline]
38 pub fn add(&mut self, v: T) {
39 *self.data.entry(v).or_insert(0) += 1;
40 }
41
42 #[inline]
44 #[must_use]
45 pub fn count(&self, v: &T) -> u64 {
46 self.data.get(v).copied().unwrap_or(0)
47 }
48
49 #[inline]
51 #[must_use]
52 pub fn cardinality(&self) -> u64 {
53 self.len() as u64
54 }
55
56 #[inline]
61 #[must_use]
62 pub fn mode(&self) -> Option<&T> {
63 let (counts, _) = self.most_frequent();
64 if counts.is_empty() {
65 return None;
66 }
67 if counts.len() >= 2 && counts[0].1 == counts[1].1 {
69 None
70 } else {
71 Some(counts[0].0)
72 }
73 }
74
75 #[inline]
78 #[must_use]
79 pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
80 let len = self.data.len();
81 let mut counts = Vec::with_capacity(len);
82 let mut total_count = 0_u64;
83
84 for (k, &v) in &self.data {
85 total_count += v;
86 counts.push((k, v));
87 }
88 counts.sort_unstable_by(|&(_, c1), &(_, c2)| c2.cmp(&c1));
89 (counts, total_count)
90 }
91
92 #[inline]
95 #[must_use]
96 pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
97 let mut total_count = 0_u64;
98 let mut counts: Vec<_> = self
99 .data
100 .iter()
101 .map(|(k, &v)| {
102 total_count += v;
103 (k, v)
104 })
105 .collect();
106 counts.sort_unstable_by(|&(_, c1), &(_, c2)| c1.cmp(&c2));
107 (counts, total_count)
108 }
109
110 #[inline]
113 #[must_use]
114 pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
115 where
116 for<'a> (&'a T, u64): Send,
117 T: Ord,
118 {
119 let mut total_count = 0_u64;
120 let mut counts: Vec<_> = self
121 .data
122 .iter()
123 .map(|(k, &v)| {
124 total_count += v;
125 (k, v)
126 })
127 .collect();
128 if least {
132 counts.par_sort_unstable_by(|&(v1, c1), &(v2, c2)| {
134 let cmp = c1.cmp(&c2);
135 if cmp == std::cmp::Ordering::Equal {
136 v1.cmp(v2)
137 } else {
138 cmp
139 }
140 });
141 } else {
142 counts
144 .par_sort_unstable_by(|&(v1, c1), &(v2, c2)| c2.cmp(&c1).then_with(|| v1.cmp(v2)));
145 }
146 (counts, total_count)
147 }
148
149 #[must_use]
151 pub fn len(&self) -> usize {
152 self.data.len()
153 }
154
155 #[must_use]
157 pub fn is_empty(&self) -> bool {
158 self.data.is_empty()
159 }
160
161 #[must_use]
163 pub fn unique_values(&self) -> UniqueValues<'_, T> {
164 UniqueValues {
165 data_keys: self.data.keys(),
166 }
167 }
168
169 #[must_use]
172 pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
173 where
174 T: Ord,
175 {
176 use std::collections::BinaryHeap;
177
178 let mut heap = BinaryHeap::with_capacity(n + 1);
180
181 for (item, count) in &self.data {
182 heap.push(std::cmp::Reverse((*count, item)));
185
186 if heap.len() > n {
188 heap.pop();
189 }
190 }
191
192 heap.into_sorted_vec()
194 .into_iter()
195 .map(|std::cmp::Reverse((count, item))| (item, count))
196 .collect()
197 }
198
199 #[must_use]
201 pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
202 where
203 T: Ord,
204 {
205 use std::collections::BinaryHeap;
206
207 let mut heap = BinaryHeap::with_capacity(n + 1);
208
209 for (item, count) in &self.data {
210 heap.push((*count, item));
211 if heap.len() > n {
212 heap.pop();
213 }
214 }
215
216 heap.into_sorted_vec()
217 .into_iter()
218 .map(|(count, item)| (item, count))
219 .collect()
220 }
221
222 #[must_use]
224 pub fn items_with_count(&self, n: u64) -> Vec<&T> {
225 self.data
226 .iter()
227 .filter(|&(_, &count)| count == n)
228 .map(|(item, _)| item)
229 .collect()
230 }
231
232 #[must_use]
234 pub fn total_count(&self) -> u64 {
235 self.data.values().sum()
236 }
237
238 #[must_use]
240 pub fn has_count(&self, n: u64) -> bool {
241 self.data.values().any(|&count| count == n)
242 }
243
244 #[inline]
246 pub fn increment_by(&mut self, v: T, count: u64) {
247 match self.data.entry(v) {
248 Entry::Vacant(entry) => {
249 entry.insert(count);
250 }
251 Entry::Occupied(mut entry) => {
252 *entry.get_mut() += count;
253 }
254 }
255 }
256}
257
258impl<T: Eq + Hash> Commute for Frequencies<T> {
259 #[inline]
260 fn merge(&mut self, v: Frequencies<T>) {
261 self.data.reserve(v.data.len());
263
264 for (k, v2) in v.data {
265 match self.data.entry(k) {
266 Entry::Vacant(v1) => {
267 v1.insert(v2);
268 }
269 Entry::Occupied(mut v1) => {
270 *v1.get_mut() += v2;
271 }
272 }
273 }
274 }
275}
276
277impl<T: Eq + Hash> Default for Frequencies<T> {
278 #[inline]
279 fn default() -> Frequencies<T> {
280 Frequencies {
281 data: HashMap::with_capacity(1_000),
282 }
283 }
284}
285
286impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
287 #[inline]
288 fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
289 let mut v = Frequencies::new();
290 v.extend(it);
291 v
292 }
293}
294
295impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
296 #[inline]
297 fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
298 for sample in it {
299 self.add(sample);
300 }
301 }
302}
303
304pub struct UniqueValues<'a, K> {
306 data_keys: Keys<'a, K, u64>,
307}
308
309impl<'a, K> Iterator for UniqueValues<'a, K> {
310 type Item = &'a K;
311 fn next(&mut self) -> Option<Self::Item> {
312 self.data_keys.next()
313 }
314}
315
316#[cfg(test)]
317mod test {
318 use super::Frequencies;
319 use std::iter::FromIterator;
320
321 #[test]
322 fn ranked() {
323 let mut counts = Frequencies::new();
324 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
325 let (most_count, most_total) = counts.most_frequent();
326 assert_eq!(most_count[0], (&2, 5));
327 assert_eq!(most_total, 11);
328 let (least_count, least_total) = counts.least_frequent();
329 assert_eq!(least_count[0], (&3, 1));
330 assert_eq!(least_total, 11);
331 assert_eq!(
332 counts.least_frequent(),
333 (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
334 );
335 }
336
337 #[test]
338 fn ranked2() {
339 let mut counts = Frequencies::new();
340 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
341 let (most_count, most_total) = counts.par_frequent(false);
342 assert_eq!(most_count[0], (&2, 5));
343 assert_eq!(most_total, 11);
344 let (least_count, least_total) = counts.par_frequent(true);
345 assert_eq!(least_count[0], (&3, 1));
346 assert_eq!(least_total, 11);
347 }
348
349 #[test]
350 fn unique_values() {
351 let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
352 let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
353 unique.sort_unstable();
354 assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
355 }
356
357 #[test]
358 fn test_top_n() {
359 let mut freq = Frequencies::new();
360 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
361
362 let top_2 = freq.top_n(2);
363 assert_eq!(top_2.len(), 2);
364 assert_eq!(top_2[0], (&4, 4)); assert_eq!(top_2[1], (&1, 3)); let bottom_2 = freq.bottom_n(2);
368 assert_eq!(bottom_2.len(), 2);
369 assert_eq!(bottom_2[0], (&3, 1)); assert_eq!(bottom_2[1], (&2, 2)); }
372
373 #[test]
374 fn test_count_methods() {
375 let mut freq = Frequencies::new();
376 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
377
378 assert_eq!(freq.total_count(), 10);
380
381 assert!(freq.has_count(3)); assert!(freq.has_count(4)); assert!(freq.has_count(1)); assert!(!freq.has_count(5)); let items_with_3 = freq.items_with_count(3);
389 assert_eq!(items_with_3, vec![&1]); let items_with_2 = freq.items_with_count(2);
392 assert_eq!(items_with_2, vec![&2]); let items_with_1 = freq.items_with_count(1);
395 assert_eq!(items_with_1, vec![&3]); let items_with_4 = freq.items_with_count(4);
398 assert_eq!(items_with_4, vec![&4]); let items_with_5 = freq.items_with_count(5);
401 assert!(items_with_5.is_empty()); }
403}