1use std::collections::hash_map::{Entry, Keys};
2use std::hash::Hash;
3
4use foldhash::{HashMap, HashMapExt};
5
6use rayon::prelude::*;
7
8use crate::Commute;
9
10const PARALLEL_THRESHOLD: usize = 10_000;
11#[derive(Clone)]
13pub struct Frequencies<T> {
14 data: HashMap<T, u64>,
15}
16
17#[cfg(debug_assertions)]
18impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
19 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
20 write!(f, "{:?}", self.data)
21 }
22}
23
24impl<T: Eq + Hash> Frequencies<T> {
25 #[must_use]
27 pub fn new() -> Frequencies<T> {
28 Default::default()
29 }
30
31 #[must_use]
33 pub fn with_capacity(capacity: usize) -> Self {
34 Frequencies {
35 data: HashMap::with_capacity(capacity),
36 }
37 }
38
39 #[inline]
41 pub fn add(&mut self, v: T) {
42 *self.data.entry(v).or_insert(0) += 1;
43 }
44
45 #[inline]
47 #[must_use]
48 pub fn count(&self, v: &T) -> u64 {
49 self.data.get(v).copied().unwrap_or(0)
50 }
51
52 #[inline]
54 #[must_use]
55 pub fn cardinality(&self) -> u64 {
56 self.len() as u64
57 }
58
59 fn collect_counts(&self) -> (Vec<(&T, u64)>, u64) {
61 let mut total_count = 0u64;
62 let counts: Vec<(&T, u64)> = self
63 .data
64 .iter()
65 .map(|(k, &v)| {
66 total_count += v;
67 (k, v)
68 })
69 .collect();
70 (counts, total_count)
71 }
72
73 #[inline]
76 #[must_use]
77 pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
78 let (mut counts, total_count) = self.collect_counts();
79 counts.sort_unstable_by_key(|&(_, c)| std::cmp::Reverse(c));
80 (counts, total_count)
81 }
82
83 #[inline]
86 #[must_use]
87 pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
88 let (mut counts, total_count) = self.collect_counts();
89 counts.sort_unstable_by_key(|&(_, c)| c);
90 (counts, total_count)
91 }
92
93 #[inline]
96 #[must_use]
97 pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
98 where
99 for<'a> (&'a T, u64): Send,
100 T: Ord,
101 {
102 let (mut counts, total_count) = self.collect_counts();
103 if least {
108 let sort_fn =
110 |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c1.cmp(&c2).then_with(|| v1.cmp(v2));
111 if counts.len() < PARALLEL_THRESHOLD {
112 counts.sort_unstable_by(sort_fn);
113 } else {
114 counts.par_sort_unstable_by(sort_fn);
115 }
116 } else {
117 let sort_fn =
119 |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c2.cmp(&c1).then_with(|| v1.cmp(v2));
120 if counts.len() < PARALLEL_THRESHOLD {
121 counts.sort_unstable_by(sort_fn);
122 } else {
123 counts.par_sort_unstable_by(sort_fn);
124 }
125 }
126 (counts, total_count)
127 }
128
129 #[must_use]
131 pub fn len(&self) -> usize {
132 self.data.len()
133 }
134
135 #[must_use]
137 pub fn is_empty(&self) -> bool {
138 self.data.is_empty()
139 }
140
141 #[must_use]
143 pub fn unique_values(&self) -> UniqueValues<'_, T> {
144 UniqueValues {
145 data_keys: self.data.keys(),
146 }
147 }
148
149 #[must_use]
152 pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
153 where
154 T: Ord,
155 {
156 use std::collections::BinaryHeap;
157
158 let mut heap = BinaryHeap::with_capacity(n);
163
164 for (item, count) in &self.data {
165 let candidate = (*count, item);
166 if heap.len() < n {
167 heap.push(std::cmp::Reverse(candidate));
168 } else if let Some(top) = heap.peek()
169 && candidate > top.0
170 {
171 heap.pop();
172 heap.push(std::cmp::Reverse(candidate));
173 }
174 }
175
176 heap.into_sorted_vec()
178 .into_iter()
179 .map(|std::cmp::Reverse((count, item))| (item, count))
180 .collect()
181 }
182
183 #[must_use]
185 pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
186 where
187 T: Ord,
188 {
189 use std::collections::BinaryHeap;
190
191 let mut heap = BinaryHeap::with_capacity(n);
193
194 for (item, count) in &self.data {
195 let candidate = (*count, item);
196 if heap.len() < n {
197 heap.push(candidate);
198 } else if let Some(top) = heap.peek()
199 && candidate < *top
200 {
201 heap.pop();
202 heap.push(candidate);
203 }
204 }
205
206 heap.into_sorted_vec()
207 .into_iter()
208 .map(|(count, item)| (item, count))
209 .collect()
210 }
211
212 #[must_use]
214 pub fn items_with_count(&self, n: u64) -> Vec<&T> {
215 self.data
216 .iter()
217 .filter(|&(_, &count)| count == n)
218 .map(|(item, _)| item)
219 .collect()
220 }
221
222 #[must_use]
224 pub fn total_count(&self) -> u64 {
225 self.data.values().sum()
226 }
227
228 #[must_use]
230 pub fn has_count(&self, n: u64) -> bool {
231 self.data.values().any(|&count| count == n)
232 }
233
234 #[inline]
236 pub fn increment_by(&mut self, v: T, count: u64) {
237 match self.data.entry(v) {
238 Entry::Vacant(entry) => {
239 entry.insert(count);
240 }
241 Entry::Occupied(mut entry) => {
242 *entry.get_mut() += count;
243 }
244 }
245 }
246}
247
248impl Frequencies<Vec<u8>> {
249 #[inline]
254 pub fn add_borrowed(&mut self, v: &[u8]) {
255 if let Some(count) = self.data.get_mut(v) {
256 *count += 1;
257 } else {
258 self.data.insert(v.to_vec(), 1);
259 }
260 }
261
262 #[inline]
264 pub fn increment_by_borrowed(&mut self, v: &[u8], count: u64) {
265 if let Some(existing) = self.data.get_mut(v) {
266 *existing += count;
267 } else {
268 self.data.insert(v.to_vec(), count);
269 }
270 }
271}
272
273impl<T: Eq + Hash> Commute for Frequencies<T> {
274 #[inline]
275 fn merge(&mut self, v: Frequencies<T>) {
276 self.data.reserve(v.data.len());
278
279 for (k, v2) in v.data {
280 match self.data.entry(k) {
281 Entry::Vacant(v1) => {
282 v1.insert(v2);
283 }
284 Entry::Occupied(mut v1) => {
285 *v1.get_mut() += v2;
286 }
287 }
288 }
289 }
290}
291
292impl<T: Eq + Hash> Default for Frequencies<T> {
293 #[inline]
294 fn default() -> Frequencies<T> {
295 Frequencies {
296 data: HashMap::with_capacity(64),
297 }
298 }
299}
300
301impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
302 #[inline]
303 fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
304 let mut v = Frequencies::new();
305 v.extend(it);
306 v
307 }
308}
309
310impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
311 #[inline]
312 fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
313 let iter = it.into_iter();
314 if let (lower, Some(upper)) = iter.size_hint()
316 && lower == upper
317 {
318 self.data.reserve(lower.saturating_sub(self.data.len()));
321 }
322 for sample in iter {
323 self.add(sample);
324 }
325 }
326}
327
328pub struct UniqueValues<'a, K> {
330 data_keys: Keys<'a, K, u64>,
331}
332
333impl<'a, K> Iterator for UniqueValues<'a, K> {
334 type Item = &'a K;
335 fn next(&mut self) -> Option<Self::Item> {
336 self.data_keys.next()
337 }
338}
339
340#[cfg(test)]
341mod test {
342 use super::Frequencies;
343 use std::iter::FromIterator;
344
345 #[test]
346 fn ranked() {
347 let mut counts = Frequencies::new();
348 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
349 let (most_count, most_total) = counts.most_frequent();
350 assert_eq!(most_count[0], (&2, 5));
351 assert_eq!(most_total, 11);
352 let (least_count, least_total) = counts.least_frequent();
353 assert_eq!(least_count[0], (&3, 1));
354 assert_eq!(least_total, 11);
355 assert_eq!(
356 counts.least_frequent(),
357 (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
358 );
359 }
360
361 #[test]
362 fn ranked2() {
363 let mut counts = Frequencies::new();
364 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
365 let (most_count, most_total) = counts.par_frequent(false);
366 assert_eq!(most_count[0], (&2, 5));
367 assert_eq!(most_total, 11);
368 let (least_count, least_total) = counts.par_frequent(true);
369 assert_eq!(least_count[0], (&3, 1));
370 assert_eq!(least_total, 11);
371 }
372
373 #[test]
374 fn unique_values() {
375 let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
376 let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
377 unique.sort_unstable();
378 assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
379 }
380
381 #[test]
382 fn test_top_n() {
383 let mut freq = Frequencies::new();
384 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
385
386 let top_2 = freq.top_n(2);
387 assert_eq!(top_2.len(), 2);
388 assert_eq!(top_2[0], (&4, 4)); assert_eq!(top_2[1], (&1, 3)); let bottom_2 = freq.bottom_n(2);
392 assert_eq!(bottom_2.len(), 2);
393 assert_eq!(bottom_2[0], (&3, 1)); assert_eq!(bottom_2[1], (&2, 2)); }
396
397 #[test]
398 fn test_count_methods() {
399 let mut freq = Frequencies::new();
400 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
401
402 assert_eq!(freq.total_count(), 10);
404
405 assert!(freq.has_count(3)); assert!(freq.has_count(4)); assert!(freq.has_count(1)); assert!(!freq.has_count(5)); let items_with_3 = freq.items_with_count(3);
413 assert_eq!(items_with_3, vec![&1]); let items_with_2 = freq.items_with_count(2);
416 assert_eq!(items_with_2, vec![&2]); let items_with_1 = freq.items_with_count(1);
419 assert_eq!(items_with_1, vec![&3]); let items_with_4 = freq.items_with_count(4);
422 assert_eq!(items_with_4, vec![&4]); let items_with_5 = freq.items_with_count(5);
425 assert!(items_with_5.is_empty()); }
427
428 #[test]
429 fn add_borrowed_inserts_new_key() {
430 let mut freq = Frequencies::<Vec<u8>>::new();
431 freq.add_borrowed(b"hello");
432 assert_eq!(freq.count(&b"hello".to_vec()), 1);
433 assert_eq!(freq.cardinality(), 1);
434 }
435
436 #[test]
437 fn add_borrowed_increments_existing_key() {
438 let mut freq = Frequencies::<Vec<u8>>::new();
439 freq.add_borrowed(b"hello");
440 freq.add_borrowed(b"hello");
441 freq.add_borrowed(b"hello");
442 assert_eq!(freq.count(&b"hello".to_vec()), 3);
443 assert_eq!(freq.cardinality(), 1);
444
445 freq.increment_by_borrowed(b"world", 5);
447 assert_eq!(freq.count(&b"world".to_vec()), 5);
448 freq.increment_by_borrowed(b"world", 3);
449 assert_eq!(freq.count(&b"world".to_vec()), 8);
450 }
451
452 #[test]
453 fn borrowed_owned_interop_for_same_key() {
454 let mut freq = Frequencies::<Vec<u8>>::new();
455 freq.add(b"key".to_vec());
457 freq.add_borrowed(b"key");
459 freq.increment_by_borrowed(b"key", 3);
460 assert_eq!(freq.count(&b"key".to_vec()), 5);
462 assert_eq!(freq.cardinality(), 1);
463 }
464}