1use foldhash::{HashMap, HashMapExt};
2use std::collections::hash_map::{Entry, Keys};
3use std::hash::Hash;
4
5use rayon::prelude::*;
6
7use crate::Commute;
8#[derive(Clone)]
10pub struct Frequencies<T> {
11 data: HashMap<T, u64>,
12}
13
14#[cfg(debug_assertions)]
15impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
16 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
17 write!(f, "{:?}", self.data)
18 }
19}
20
21impl<T: Eq + Hash> Frequencies<T> {
22 #[must_use]
24 pub fn new() -> Frequencies<T> {
25 Default::default()
26 }
27
28 #[must_use]
30 pub fn with_capacity(capacity: usize) -> Self {
31 Frequencies {
32 data: HashMap::with_capacity(capacity),
33 }
34 }
35
36 #[inline]
38 pub fn add(&mut self, v: T) {
39 *self.data.entry(v).or_insert(0) += 1;
40 }
41
42 #[inline]
44 #[must_use]
45 pub fn count(&self, v: &T) -> u64 {
46 self.data.get(v).copied().unwrap_or(0)
47 }
48
49 #[inline]
51 #[must_use]
52 pub fn cardinality(&self) -> u64 {
53 self.len() as u64
54 }
55
56 #[inline]
59 #[must_use]
60 pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
61 let len = self.data.len();
62 let total_count: u64 = self.data.values().sum();
63 let mut counts = Vec::with_capacity(len);
64
65 for (k, &v) in &self.data {
66 counts.push((k, v));
67 }
68 counts.sort_unstable_by(|&(_, c1), &(_, c2)| c2.cmp(&c1));
69 (counts, total_count)
70 }
71
72 #[inline]
75 #[must_use]
76 pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
77 let len = self.data.len();
78 let total_count: u64 = self.data.values().sum();
79 let mut counts = Vec::with_capacity(len);
80
81 for (k, &v) in &self.data {
82 counts.push((k, v));
83 }
84 counts.sort_unstable_by(|&(_, c1), &(_, c2)| c1.cmp(&c2));
85 (counts, total_count)
86 }
87
88 #[inline]
91 #[must_use]
92 pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
93 where
94 for<'a> (&'a T, u64): Send,
95 T: Ord,
96 {
97 let len = self.data.len();
98 let total_count: u64 = self.data.values().sum();
99 let mut counts = Vec::with_capacity(len);
100
101 for (k, &v) in &self.data {
102 counts.push((k, v));
103 }
104 if least {
108 counts.par_sort_unstable_by(|&(v1, c1), &(v2, c2)| {
110 let cmp = c1.cmp(&c2);
111 if cmp == std::cmp::Ordering::Equal {
112 v1.cmp(v2)
113 } else {
114 cmp
115 }
116 });
117 } else {
118 counts
120 .par_sort_unstable_by(|&(v1, c1), &(v2, c2)| c2.cmp(&c1).then_with(|| v1.cmp(v2)));
121 }
122 (counts, total_count)
123 }
124
125 #[must_use]
127 pub fn len(&self) -> usize {
128 self.data.len()
129 }
130
131 #[must_use]
133 pub fn is_empty(&self) -> bool {
134 self.data.is_empty()
135 }
136
137 #[must_use]
139 pub fn unique_values(&self) -> UniqueValues<'_, T> {
140 UniqueValues {
141 data_keys: self.data.keys(),
142 }
143 }
144
145 #[must_use]
148 pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
149 where
150 T: Ord,
151 {
152 use std::collections::BinaryHeap;
153
154 let mut heap = BinaryHeap::with_capacity(n + 1);
156
157 for (item, count) in &self.data {
158 heap.push(std::cmp::Reverse((*count, item)));
161
162 if heap.len() > n {
164 heap.pop();
165 }
166 }
167
168 heap.into_sorted_vec()
170 .into_iter()
171 .map(|std::cmp::Reverse((count, item))| (item, count))
172 .collect()
173 }
174
175 #[must_use]
177 pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
178 where
179 T: Ord,
180 {
181 use std::collections::BinaryHeap;
182
183 let mut heap = BinaryHeap::with_capacity(n + 1);
184
185 for (item, count) in &self.data {
186 heap.push((*count, item));
187 if heap.len() > n {
188 heap.pop();
189 }
190 }
191
192 heap.into_sorted_vec()
193 .into_iter()
194 .map(|(count, item)| (item, count))
195 .collect()
196 }
197
198 #[must_use]
200 pub fn items_with_count(&self, n: u64) -> Vec<&T> {
201 self.data
202 .iter()
203 .filter(|&(_, &count)| count == n)
204 .map(|(item, _)| item)
205 .collect()
206 }
207
208 #[must_use]
210 pub fn total_count(&self) -> u64 {
211 self.data.values().sum()
212 }
213
214 #[must_use]
216 pub fn has_count(&self, n: u64) -> bool {
217 self.data.values().any(|&count| count == n)
218 }
219
220 #[inline]
222 pub fn increment_by(&mut self, v: T, count: u64) {
223 match self.data.entry(v) {
224 Entry::Vacant(entry) => {
225 entry.insert(count);
226 }
227 Entry::Occupied(mut entry) => {
228 *entry.get_mut() += count;
229 }
230 }
231 }
232}
233
234impl<T: Eq + Hash> Commute for Frequencies<T> {
235 #[inline]
236 fn merge(&mut self, v: Frequencies<T>) {
237 self.data.reserve(v.data.len());
239
240 for (k, v2) in v.data {
241 match self.data.entry(k) {
242 Entry::Vacant(v1) => {
243 v1.insert(v2);
244 }
245 Entry::Occupied(mut v1) => {
246 *v1.get_mut() += v2;
247 }
248 }
249 }
250 }
251}
252
253impl<T: Eq + Hash> Default for Frequencies<T> {
254 #[inline]
255 fn default() -> Frequencies<T> {
256 Frequencies {
257 data: HashMap::with_capacity(1_000),
258 }
259 }
260}
261
262impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
263 #[inline]
264 fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
265 let mut v = Frequencies::new();
266 v.extend(it);
267 v
268 }
269}
270
271impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
272 #[inline]
273 fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
274 let iter = it.into_iter();
275 if let (lower, Some(upper)) = iter.size_hint()
277 && lower == upper
278 {
279 self.data.reserve(lower.saturating_sub(self.data.len()));
282 }
283 for sample in iter {
284 self.add(sample);
285 }
286 }
287}
288
289pub struct UniqueValues<'a, K> {
291 data_keys: Keys<'a, K, u64>,
292}
293
294impl<'a, K> Iterator for UniqueValues<'a, K> {
295 type Item = &'a K;
296 fn next(&mut self) -> Option<Self::Item> {
297 self.data_keys.next()
298 }
299}
300
301#[cfg(test)]
302mod test {
303 use super::Frequencies;
304 use std::iter::FromIterator;
305
306 #[test]
307 fn ranked() {
308 let mut counts = Frequencies::new();
309 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
310 let (most_count, most_total) = counts.most_frequent();
311 assert_eq!(most_count[0], (&2, 5));
312 assert_eq!(most_total, 11);
313 let (least_count, least_total) = counts.least_frequent();
314 assert_eq!(least_count[0], (&3, 1));
315 assert_eq!(least_total, 11);
316 assert_eq!(
317 counts.least_frequent(),
318 (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
319 );
320 }
321
322 #[test]
323 fn ranked2() {
324 let mut counts = Frequencies::new();
325 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
326 let (most_count, most_total) = counts.par_frequent(false);
327 assert_eq!(most_count[0], (&2, 5));
328 assert_eq!(most_total, 11);
329 let (least_count, least_total) = counts.par_frequent(true);
330 assert_eq!(least_count[0], (&3, 1));
331 assert_eq!(least_total, 11);
332 }
333
334 #[test]
335 fn unique_values() {
336 let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
337 let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
338 unique.sort_unstable();
339 assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
340 }
341
342 #[test]
343 fn test_top_n() {
344 let mut freq = Frequencies::new();
345 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
346
347 let top_2 = freq.top_n(2);
348 assert_eq!(top_2.len(), 2);
349 assert_eq!(top_2[0], (&4, 4)); assert_eq!(top_2[1], (&1, 3)); let bottom_2 = freq.bottom_n(2);
353 assert_eq!(bottom_2.len(), 2);
354 assert_eq!(bottom_2[0], (&3, 1)); assert_eq!(bottom_2[1], (&2, 2)); }
357
358 #[test]
359 fn test_count_methods() {
360 let mut freq = Frequencies::new();
361 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
362
363 assert_eq!(freq.total_count(), 10);
365
366 assert!(freq.has_count(3)); assert!(freq.has_count(4)); assert!(freq.has_count(1)); assert!(!freq.has_count(5)); let items_with_3 = freq.items_with_count(3);
374 assert_eq!(items_with_3, vec![&1]); let items_with_2 = freq.items_with_count(2);
377 assert_eq!(items_with_2, vec![&2]); let items_with_1 = freq.items_with_count(1);
380 assert_eq!(items_with_1, vec![&3]); let items_with_4 = freq.items_with_count(4);
383 assert_eq!(items_with_4, vec![&4]); let items_with_5 = freq.items_with_count(5);
386 assert!(items_with_5.is_empty()); }
388}