1use std::hash::Hash;
2
3use hashbrown::HashMap;
4use hashbrown::hash_map::{Entry, Keys};
5
6use rayon::prelude::*;
7
8use crate::Commute;
9
10const PARALLEL_THRESHOLD: usize = 10_000;
11#[derive(Clone)]
13pub struct Frequencies<T> {
14 data: HashMap<T, u64>,
15}
16
17#[cfg(debug_assertions)]
18impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
19 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
20 write!(f, "{:?}", self.data)
21 }
22}
23
24impl<T: Eq + Hash> Frequencies<T> {
25 #[must_use]
27 pub fn new() -> Frequencies<T> {
28 Default::default()
29 }
30
31 #[must_use]
33 pub fn with_capacity(capacity: usize) -> Self {
34 Frequencies {
35 data: HashMap::with_capacity(capacity),
36 }
37 }
38
39 #[allow(clippy::inline_always)]
41 #[inline(always)]
42 pub fn add(&mut self, v: T) {
43 *self.data.entry(v).or_insert(0) += 1;
44 }
45
46 #[inline]
48 #[must_use]
49 pub fn count(&self, v: &T) -> u64 {
50 self.data.get(v).copied().unwrap_or(0)
51 }
52
53 #[inline]
55 #[must_use]
56 pub fn cardinality(&self) -> u64 {
57 self.len() as u64
58 }
59
60 fn collect_counts(&self) -> (Vec<(&T, u64)>, u64) {
62 let mut total_count = 0u64;
63 let counts: Vec<(&T, u64)> = self
64 .data
65 .iter()
66 .map(|(k, &v)| {
67 total_count += v;
68 (k, v)
69 })
70 .collect();
71 (counts, total_count)
72 }
73
74 #[inline]
77 #[must_use]
78 pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
79 let (mut counts, total_count) = self.collect_counts();
80 counts.sort_unstable_by_key(|&(_, c)| std::cmp::Reverse(c));
81 (counts, total_count)
82 }
83
84 #[inline]
87 #[must_use]
88 pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
89 let (mut counts, total_count) = self.collect_counts();
90 counts.sort_unstable_by_key(|&(_, c)| c);
91 (counts, total_count)
92 }
93
94 #[inline]
97 #[must_use]
98 pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
99 where
100 for<'a> (&'a T, u64): Send,
101 T: Ord,
102 {
103 let (mut counts, total_count) = self.collect_counts();
104 if least {
109 let sort_fn =
111 |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c1.cmp(&c2).then_with(|| v1.cmp(v2));
112 if counts.len() < PARALLEL_THRESHOLD {
113 counts.sort_unstable_by(sort_fn);
114 } else {
115 counts.par_sort_unstable_by(sort_fn);
116 }
117 } else {
118 let sort_fn =
120 |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c2.cmp(&c1).then_with(|| v1.cmp(v2));
121 if counts.len() < PARALLEL_THRESHOLD {
122 counts.sort_unstable_by(sort_fn);
123 } else {
124 counts.par_sort_unstable_by(sort_fn);
125 }
126 }
127 (counts, total_count)
128 }
129
130 #[must_use]
132 pub fn len(&self) -> usize {
133 self.data.len()
134 }
135
136 #[must_use]
138 pub fn is_empty(&self) -> bool {
139 self.data.is_empty()
140 }
141
142 #[must_use]
144 pub fn unique_values(&self) -> UniqueValues<'_, T> {
145 UniqueValues {
146 data_keys: self.data.keys(),
147 }
148 }
149
150 #[must_use]
153 pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
154 where
155 T: Ord,
156 {
157 use std::collections::BinaryHeap;
158
159 let mut heap = BinaryHeap::with_capacity(n);
170
171 for (item, count) in &self.data {
172 let candidate = (*count, std::cmp::Reverse(item));
173 if heap.len() < n {
174 heap.push(std::cmp::Reverse(candidate));
175 } else if let Some(top) = heap.peek()
176 && candidate > top.0
177 {
178 heap.pop();
179 heap.push(std::cmp::Reverse(candidate));
180 }
181 }
182
183 heap.into_sorted_vec()
185 .into_iter()
186 .map(|std::cmp::Reverse((count, std::cmp::Reverse(item)))| (item, count))
187 .collect()
188 }
189
190 #[must_use]
192 pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
193 where
194 T: Ord,
195 {
196 use std::collections::BinaryHeap;
197
198 let mut heap = BinaryHeap::with_capacity(n);
200
201 for (item, count) in &self.data {
202 let candidate = (*count, item);
203 if heap.len() < n {
204 heap.push(candidate);
205 } else if let Some(top) = heap.peek()
206 && candidate < *top
207 {
208 heap.pop();
209 heap.push(candidate);
210 }
211 }
212
213 heap.into_sorted_vec()
214 .into_iter()
215 .map(|(count, item)| (item, count))
216 .collect()
217 }
218
219 #[must_use]
221 pub fn items_with_count(&self, n: u64) -> Vec<&T> {
222 self.data
223 .iter()
224 .filter(|&(_, &count)| count == n)
225 .map(|(item, _)| item)
226 .collect()
227 }
228
229 #[must_use]
231 pub fn total_count(&self) -> u64 {
232 self.data.values().sum()
233 }
234
235 #[must_use]
237 pub fn has_count(&self, n: u64) -> bool {
238 self.data.values().any(|&count| count == n)
239 }
240
241 #[allow(clippy::inline_always)]
243 #[inline(always)]
244 pub fn increment_by(&mut self, v: T, count: u64) {
245 match self.data.entry(v) {
246 Entry::Vacant(entry) => {
247 entry.insert(count);
248 }
249 Entry::Occupied(mut entry) => {
250 *entry.get_mut() += count;
251 }
252 }
253 }
254}
255
256impl Frequencies<Vec<u8>> {
257 #[allow(clippy::inline_always)]
264 #[inline(always)]
265 pub fn add_borrowed(&mut self, v: &[u8]) {
266 *self.data.entry_ref(v).or_insert(0) += 1;
267 }
268
269 #[allow(clippy::inline_always)]
271 #[inline(always)]
272 pub fn increment_by_borrowed(&mut self, v: &[u8], count: u64) {
273 *self.data.entry_ref(v).or_insert(0) += count;
274 }
275}
276
277impl<T: Eq + Hash> Commute for Frequencies<T> {
278 #[inline]
279 fn merge(&mut self, v: Frequencies<T>) {
280 self.data.reserve(v.data.len());
282
283 for (k, v2) in v.data {
284 match self.data.entry(k) {
285 Entry::Vacant(v1) => {
286 v1.insert(v2);
287 }
288 Entry::Occupied(mut v1) => {
289 *v1.get_mut() += v2;
290 }
291 }
292 }
293 }
294}
295
296impl<T: Eq + Hash> Default for Frequencies<T> {
297 #[inline]
298 fn default() -> Frequencies<T> {
299 Frequencies {
300 data: HashMap::with_capacity(64),
301 }
302 }
303}
304
305impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
306 #[inline]
307 fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
308 let mut v = Frequencies::new();
309 v.extend(it);
310 v
311 }
312}
313
314impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
315 #[inline]
316 fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
317 let iter = it.into_iter();
318 if let (lower, Some(upper)) = iter.size_hint()
320 && lower == upper
321 {
322 self.data.reserve(lower.saturating_sub(self.data.len()));
325 }
326 for sample in iter {
327 self.add(sample);
328 }
329 }
330}
331
332pub struct UniqueValues<'a, K> {
334 data_keys: Keys<'a, K, u64>,
335}
336
337impl<'a, K> Iterator for UniqueValues<'a, K> {
338 type Item = &'a K;
339 #[inline]
340 fn next(&mut self) -> Option<Self::Item> {
341 self.data_keys.next()
342 }
343
344 #[inline]
347 fn size_hint(&self) -> (usize, Option<usize>) {
348 self.data_keys.size_hint()
349 }
350}
351
352impl<K> ExactSizeIterator for UniqueValues<'_, K> {
353 #[inline]
354 fn len(&self) -> usize {
355 self.data_keys.len()
356 }
357}
358
359impl<K> std::iter::FusedIterator for UniqueValues<'_, K> {}
360
361#[cfg(test)]
362mod test {
363 use super::Frequencies;
364 use std::iter::FromIterator;
365
366 #[test]
367 fn ranked() {
368 let mut counts = Frequencies::new();
369 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
370 let (most_count, most_total) = counts.most_frequent();
371 assert_eq!(most_count[0], (&2, 5));
372 assert_eq!(most_total, 11);
373 let (least_count, least_total) = counts.least_frequent();
374 assert_eq!(least_count[0], (&3, 1));
375 assert_eq!(least_total, 11);
376 assert_eq!(
377 counts.least_frequent(),
378 (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
379 );
380 }
381
382 #[test]
383 fn ranked2() {
384 let mut counts = Frequencies::new();
385 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
386 let (most_count, most_total) = counts.par_frequent(false);
387 assert_eq!(most_count[0], (&2, 5));
388 assert_eq!(most_total, 11);
389 let (least_count, least_total) = counts.par_frequent(true);
390 assert_eq!(least_count[0], (&3, 1));
391 assert_eq!(least_total, 11);
392 }
393
394 #[test]
395 fn unique_values() {
396 let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
397 let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
398 unique.sort_unstable();
399 assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
400 }
401
402 #[test]
403 fn test_top_n() {
404 let mut freq = Frequencies::new();
405 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
406
407 let top_2 = freq.top_n(2);
408 assert_eq!(top_2.len(), 2);
409 assert_eq!(top_2[0], (&4, 4)); assert_eq!(top_2[1], (&1, 3)); let bottom_2 = freq.bottom_n(2);
413 assert_eq!(bottom_2.len(), 2);
414 assert_eq!(bottom_2[0], (&3, 1)); assert_eq!(bottom_2[1], (&2, 2)); }
417
418 #[test]
423 fn top_n_matches_par_frequent_all_ties() {
424 let mut freq = Frequencies::new();
426 freq.extend(vec![1usize, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
427 for n in 0..=10usize {
428 let (full_desc, _) = freq.par_frequent(false);
429 let expected_desc: Vec<(&usize, u64)> = full_desc.into_iter().take(n).collect();
430 assert_eq!(freq.top_n(n), expected_desc, "top_n({n}) all-ties mismatch");
431
432 let (full_asc, _) = freq.par_frequent(true);
433 let expected_asc: Vec<(&usize, u64)> = full_asc.into_iter().take(n).collect();
434 assert_eq!(
435 freq.bottom_n(n),
436 expected_asc,
437 "bottom_n({n}) all-ties mismatch"
438 );
439 }
440 }
441
442 #[test]
443 fn top_n_matches_par_frequent_boundary_ties() {
444 let mut freq = Frequencies::new();
447 for v in 10..20usize {
448 freq.extend(vec![v, v]); }
450 for v in 20..25usize {
451 freq.extend(vec![v; 5]); }
453 for v in 30..35usize {
454 freq.extend(vec![v]); }
456 for n in 0..=freq.len() + 2 {
457 let (full_desc, _) = freq.par_frequent(false);
458 let expected_desc: Vec<(&usize, u64)> = full_desc.into_iter().take(n).collect();
459 assert_eq!(
460 freq.top_n(n),
461 expected_desc,
462 "top_n({n}) boundary-tie mismatch"
463 );
464
465 let (full_asc, _) = freq.par_frequent(true);
466 let expected_asc: Vec<(&usize, u64)> = full_asc.into_iter().take(n).collect();
467 assert_eq!(
468 freq.bottom_n(n),
469 expected_asc,
470 "bottom_n({n}) boundary-tie mismatch"
471 );
472 }
473 }
474
475 #[test]
476 fn test_count_methods() {
477 let mut freq = Frequencies::new();
478 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
479
480 assert_eq!(freq.total_count(), 10);
482
483 assert!(freq.has_count(3)); assert!(freq.has_count(4)); assert!(freq.has_count(1)); assert!(!freq.has_count(5)); let items_with_3 = freq.items_with_count(3);
491 assert_eq!(items_with_3, vec![&1]); let items_with_2 = freq.items_with_count(2);
494 assert_eq!(items_with_2, vec![&2]); let items_with_1 = freq.items_with_count(1);
497 assert_eq!(items_with_1, vec![&3]); let items_with_4 = freq.items_with_count(4);
500 assert_eq!(items_with_4, vec![&4]); let items_with_5 = freq.items_with_count(5);
503 assert!(items_with_5.is_empty()); }
505
506 #[test]
507 fn add_borrowed_inserts_new_key() {
508 let mut freq = Frequencies::<Vec<u8>>::new();
509 freq.add_borrowed(b"hello");
510 assert_eq!(freq.count(&b"hello".to_vec()), 1);
511 assert_eq!(freq.cardinality(), 1);
512 }
513
514 #[test]
515 fn add_borrowed_increments_existing_key() {
516 let mut freq = Frequencies::<Vec<u8>>::new();
517 freq.add_borrowed(b"hello");
518 freq.add_borrowed(b"hello");
519 freq.add_borrowed(b"hello");
520 assert_eq!(freq.count(&b"hello".to_vec()), 3);
521 assert_eq!(freq.cardinality(), 1);
522
523 freq.increment_by_borrowed(b"world", 5);
525 assert_eq!(freq.count(&b"world".to_vec()), 5);
526 freq.increment_by_borrowed(b"world", 3);
527 assert_eq!(freq.count(&b"world".to_vec()), 8);
528 }
529
530 #[test]
531 fn borrowed_owned_interop_for_same_key() {
532 let mut freq = Frequencies::<Vec<u8>>::new();
533 freq.add(b"key".to_vec());
535 freq.add_borrowed(b"key");
537 freq.increment_by_borrowed(b"key", 3);
538 assert_eq!(freq.count(&b"key".to_vec()), 5);
540 assert_eq!(freq.cardinality(), 1);
541 }
542}