1use std::hash::Hash;
2
3use hashbrown::HashMap;
4use hashbrown::hash_map::{Entry, EntryRef, Keys};
5
6use rayon::prelude::*;
7use serde::{Deserialize, Serialize};
8
9use crate::Commute;
10
11const PARALLEL_THRESHOLD: usize = 10_000;
12#[derive(Clone, Serialize, Deserialize)]
14#[serde(bound(
15 serialize = "T: Serialize + Eq + Hash",
16 deserialize = "T: Deserialize<'de> + Eq + Hash"
17))]
18pub struct Frequencies<T> {
19 data: HashMap<T, u64>,
20}
21
22impl<T: Eq + Hash> PartialEq for Frequencies<T> {
25 fn eq(&self, other: &Self) -> bool {
26 self.data == other.data
27 }
28}
29
30#[cfg(debug_assertions)]
31impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
32 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
33 write!(f, "{:?}", self.data)
34 }
35}
36
37impl<T: Eq + Hash> Frequencies<T> {
38 #[must_use]
40 pub fn new() -> Frequencies<T> {
41 Default::default()
42 }
43
44 #[must_use]
46 pub fn with_capacity(capacity: usize) -> Self {
47 Frequencies {
48 data: HashMap::with_capacity(capacity),
49 }
50 }
51
52 #[allow(clippy::inline_always)]
54 #[inline(always)]
55 pub fn add(&mut self, v: T) {
56 *self.data.entry(v).or_insert(0) += 1;
57 }
58
59 #[inline]
61 #[must_use]
62 pub fn count(&self, v: &T) -> u64 {
63 self.data.get(v).copied().unwrap_or(0)
64 }
65
66 #[inline]
68 #[must_use]
69 pub fn cardinality(&self) -> u64 {
70 self.len() as u64
71 }
72
73 fn collect_counts(&self) -> (Vec<(&T, u64)>, u64) {
75 let mut total_count = 0u64;
76 let counts: Vec<(&T, u64)> = self
77 .data
78 .iter()
79 .map(|(k, &v)| {
80 total_count += v;
81 (k, v)
82 })
83 .collect();
84 (counts, total_count)
85 }
86
87 #[inline]
90 #[must_use]
91 pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
92 let (mut counts, total_count) = self.collect_counts();
93 counts.sort_unstable_by_key(|&(_, c)| std::cmp::Reverse(c));
94 (counts, total_count)
95 }
96
97 #[inline]
100 #[must_use]
101 pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
102 let (mut counts, total_count) = self.collect_counts();
103 counts.sort_unstable_by_key(|&(_, c)| c);
104 (counts, total_count)
105 }
106
107 #[inline]
110 #[must_use]
111 pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
112 where
113 for<'a> (&'a T, u64): Send,
114 T: Ord,
115 {
116 let (mut counts, total_count) = self.collect_counts();
117 if least {
122 let sort_fn =
124 |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c1.cmp(&c2).then_with(|| v1.cmp(v2));
125 if counts.len() < PARALLEL_THRESHOLD {
126 counts.sort_unstable_by(sort_fn);
127 } else {
128 counts.par_sort_unstable_by(sort_fn);
129 }
130 } else {
131 let sort_fn =
133 |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c2.cmp(&c1).then_with(|| v1.cmp(v2));
134 if counts.len() < PARALLEL_THRESHOLD {
135 counts.sort_unstable_by(sort_fn);
136 } else {
137 counts.par_sort_unstable_by(sort_fn);
138 }
139 }
140 (counts, total_count)
141 }
142
143 #[must_use]
145 pub fn len(&self) -> usize {
146 self.data.len()
147 }
148
149 #[must_use]
151 pub fn is_empty(&self) -> bool {
152 self.data.is_empty()
153 }
154
155 #[must_use]
157 pub fn unique_values(&self) -> UniqueValues<'_, T> {
158 UniqueValues {
159 data_keys: self.data.keys(),
160 }
161 }
162
163 #[must_use]
166 pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
167 where
168 T: Ord,
169 {
170 use std::collections::BinaryHeap;
171
172 let mut heap = BinaryHeap::with_capacity(n);
183
184 for (item, count) in &self.data {
185 let candidate = (*count, std::cmp::Reverse(item));
186 if heap.len() < n {
187 heap.push(std::cmp::Reverse(candidate));
188 } else if let Some(top) = heap.peek()
189 && candidate > top.0
190 {
191 heap.pop();
192 heap.push(std::cmp::Reverse(candidate));
193 }
194 }
195
196 heap.into_sorted_vec()
198 .into_iter()
199 .map(|std::cmp::Reverse((count, std::cmp::Reverse(item)))| (item, count))
200 .collect()
201 }
202
203 #[must_use]
205 pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
206 where
207 T: Ord,
208 {
209 use std::collections::BinaryHeap;
210
211 let mut heap = BinaryHeap::with_capacity(n);
213
214 for (item, count) in &self.data {
215 let candidate = (*count, item);
216 if heap.len() < n {
217 heap.push(candidate);
218 } else if let Some(top) = heap.peek()
219 && candidate < *top
220 {
221 heap.pop();
222 heap.push(candidate);
223 }
224 }
225
226 heap.into_sorted_vec()
227 .into_iter()
228 .map(|(count, item)| (item, count))
229 .collect()
230 }
231
232 #[must_use]
234 pub fn items_with_count(&self, n: u64) -> Vec<&T> {
235 self.data
236 .iter()
237 .filter(|&(_, &count)| count == n)
238 .map(|(item, _)| item)
239 .collect()
240 }
241
242 #[must_use]
244 pub fn total_count(&self) -> u64 {
245 self.data.values().sum()
246 }
247
248 #[must_use]
250 pub fn has_count(&self, n: u64) -> bool {
251 self.data.values().any(|&count| count == n)
252 }
253
254 #[allow(clippy::inline_always)]
256 #[inline(always)]
257 pub fn increment_by(&mut self, v: T, count: u64) {
258 match self.data.entry(v) {
259 Entry::Vacant(entry) => {
260 entry.insert(count);
261 }
262 Entry::Occupied(mut entry) => {
263 *entry.get_mut() += count;
264 }
265 }
266 }
267}
268
269impl<T: Eq + Hash + Ord + Clone + Send + Sync> Frequencies<T> {
270 #[allow(clippy::type_complexity)]
296 #[must_use]
297 pub fn modes_antimodes(&self) -> ((Vec<T>, usize, u32), (Vec<T>, usize, u32)) {
298 if self.data.is_empty() {
299 core::hint::cold_path();
300 return ((Vec::new(), 0, 0), (Vec::new(), 0, 0));
301 }
302
303 let mut highest_count = 1_u64;
306 let mut lowest_count = u64::MAX;
307 for &c in self.data.values() {
308 highest_count = highest_count.max(c);
309 lowest_count = lowest_count.min(c);
310 }
311 let highest_count_u32 = u32::try_from(highest_count).unwrap_or(u32::MAX);
312 let lowest_count_u32 = u32::try_from(lowest_count).unwrap_or(u32::MAX);
313
314 if self.data.len() == 1 {
316 let k = self.data.keys().next().unwrap();
317 return ((vec![k.clone()], 1, lowest_count_u32), (Vec::new(), 0, 0));
318 }
319
320 if highest_count == 1 {
323 let total = self.data.len();
324 let mut keys: Vec<&T> = self.data.keys().collect();
325 if keys.len() > 10 {
326 keys.select_nth_unstable(9);
327 keys.truncate(10);
328 }
329 keys.sort_unstable();
330 let antimodes: Vec<T> = keys.into_iter().cloned().collect();
331 return ((Vec::new(), 0, 0), (antimodes, total, 1));
332 }
333
334 if highest_count == lowest_count {
337 let mut keys: Vec<&T> = self.data.keys().collect();
338 if keys.len() > PARALLEL_THRESHOLD {
339 keys.par_sort_unstable();
340 } else {
341 keys.sort_unstable();
342 }
343 let total = keys.len();
344 let antimodes: Vec<T> = keys.iter().take(10).map(|k| (*k).clone()).collect();
345 let modes: Vec<T> = keys.into_iter().cloned().collect();
346 return (
347 (modes, total, highest_count_u32),
348 (antimodes, total, lowest_count_u32),
349 );
350 }
351
352 let mut modes: Vec<&T> = Vec::new();
355 let mut antimodes: Vec<&T> = Vec::new();
356 for (k, &c) in &self.data {
357 if c == highest_count {
358 modes.push(k);
359 } else if c == lowest_count {
360 antimodes.push(k);
361 }
362 }
363
364 if modes.len() > PARALLEL_THRESHOLD {
365 modes.par_sort_unstable();
366 } else {
367 modes.sort_unstable();
368 }
369 let antimodes_total = antimodes.len();
370 if antimodes_total > 10 {
371 antimodes.select_nth_unstable(9);
372 antimodes.truncate(10);
373 }
374 antimodes.sort_unstable();
375
376 (
377 (
378 modes.iter().map(|k| (*k).clone()).collect(),
379 modes.len(),
380 highest_count_u32,
381 ),
382 (
383 antimodes.into_iter().cloned().collect(),
384 antimodes_total,
385 lowest_count_u32,
386 ),
387 )
388 }
389}
390
391impl Frequencies<Vec<u8>> {
392 #[allow(clippy::inline_always)]
399 #[inline(always)]
400 pub fn add_borrowed(&mut self, v: &[u8]) {
401 *self.data.entry_ref(v).or_insert(0) += 1;
402 }
403
404 #[allow(clippy::inline_always)]
406 #[inline(always)]
407 pub fn increment_by_borrowed(&mut self, v: &[u8], count: u64) {
408 *self.data.entry_ref(v).or_insert(0) += count;
409 }
410
411 #[allow(clippy::inline_always)]
421 #[inline(always)]
422 pub fn add_borrowed_capped(&mut self, v: &[u8], cap: u64) -> bool {
423 let len = self.data.len() as u64;
424 match self.data.entry_ref(v) {
425 EntryRef::Occupied(mut e) => {
426 *e.get_mut() += 1;
427 true
428 }
429 EntryRef::Vacant(e) => {
430 if cap > 0 && len >= cap {
431 false
432 } else {
433 e.insert(1);
434 true
435 }
436 }
437 }
438 }
439}
440
441impl<T: Eq + Hash> Commute for Frequencies<T> {
442 #[inline]
443 fn merge(&mut self, v: Frequencies<T>) {
444 self.data.reserve(v.data.len());
446
447 for (k, v2) in v.data {
448 match self.data.entry(k) {
449 Entry::Vacant(v1) => {
450 v1.insert(v2);
451 }
452 Entry::Occupied(mut v1) => {
453 *v1.get_mut() += v2;
454 }
455 }
456 }
457 }
458}
459
460impl<T: Eq + Hash> Default for Frequencies<T> {
461 #[inline]
462 fn default() -> Frequencies<T> {
463 Frequencies {
464 data: HashMap::with_capacity(64),
465 }
466 }
467}
468
469impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
470 #[inline]
471 fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
472 let mut v = Frequencies::new();
473 v.extend(it);
474 v
475 }
476}
477
478impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
479 #[inline]
480 fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
481 let iter = it.into_iter();
482 if let (lower, Some(upper)) = iter.size_hint()
484 && lower == upper
485 {
486 self.data.reserve(lower.saturating_sub(self.data.len()));
489 }
490 for sample in iter {
491 self.add(sample);
492 }
493 }
494}
495
496pub struct UniqueValues<'a, K> {
498 data_keys: Keys<'a, K, u64>,
499}
500
501impl<'a, K> Iterator for UniqueValues<'a, K> {
502 type Item = &'a K;
503 #[inline]
504 fn next(&mut self) -> Option<Self::Item> {
505 self.data_keys.next()
506 }
507
508 #[inline]
511 fn size_hint(&self) -> (usize, Option<usize>) {
512 self.data_keys.size_hint()
513 }
514}
515
516impl<K> ExactSizeIterator for UniqueValues<'_, K> {
517 #[inline]
518 fn len(&self) -> usize {
519 self.data_keys.len()
520 }
521}
522
523impl<K> std::iter::FusedIterator for UniqueValues<'_, K> {}
524
525#[cfg(test)]
526mod test {
527 use super::Frequencies;
528 use std::iter::FromIterator;
529
530 #[test]
531 fn ranked() {
532 let mut counts = Frequencies::new();
533 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
534 let (most_count, most_total) = counts.most_frequent();
535 assert_eq!(most_count[0], (&2, 5));
536 assert_eq!(most_total, 11);
537 let (least_count, least_total) = counts.least_frequent();
538 assert_eq!(least_count[0], (&3, 1));
539 assert_eq!(least_total, 11);
540 assert_eq!(
541 counts.least_frequent(),
542 (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
543 );
544 }
545
546 #[test]
547 fn ranked2() {
548 let mut counts = Frequencies::new();
549 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
550 let (most_count, most_total) = counts.par_frequent(false);
551 assert_eq!(most_count[0], (&2, 5));
552 assert_eq!(most_total, 11);
553 let (least_count, least_total) = counts.par_frequent(true);
554 assert_eq!(least_count[0], (&3, 1));
555 assert_eq!(least_total, 11);
556 }
557
558 #[test]
559 fn unique_values() {
560 let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
561 let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
562 unique.sort_unstable();
563 assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
564 }
565
566 #[test]
567 fn test_top_n() {
568 let mut freq = Frequencies::new();
569 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
570
571 let top_2 = freq.top_n(2);
572 assert_eq!(top_2.len(), 2);
573 assert_eq!(top_2[0], (&4, 4)); assert_eq!(top_2[1], (&1, 3)); let bottom_2 = freq.bottom_n(2);
577 assert_eq!(bottom_2.len(), 2);
578 assert_eq!(bottom_2[0], (&3, 1)); assert_eq!(bottom_2[1], (&2, 2)); }
581
582 #[test]
587 fn top_n_matches_par_frequent_all_ties() {
588 let mut freq = Frequencies::new();
590 freq.extend(vec![1usize, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
591 for n in 0..=10usize {
592 let (full_desc, _) = freq.par_frequent(false);
593 let expected_desc: Vec<(&usize, u64)> = full_desc.into_iter().take(n).collect();
594 assert_eq!(freq.top_n(n), expected_desc, "top_n({n}) all-ties mismatch");
595
596 let (full_asc, _) = freq.par_frequent(true);
597 let expected_asc: Vec<(&usize, u64)> = full_asc.into_iter().take(n).collect();
598 assert_eq!(
599 freq.bottom_n(n),
600 expected_asc,
601 "bottom_n({n}) all-ties mismatch"
602 );
603 }
604 }
605
606 #[test]
607 fn top_n_matches_par_frequent_boundary_ties() {
608 let mut freq = Frequencies::new();
611 for v in 10..20usize {
612 freq.extend(vec![v, v]); }
614 for v in 20..25usize {
615 freq.extend(vec![v; 5]); }
617 for v in 30..35usize {
618 freq.extend(vec![v]); }
620 for n in 0..=freq.len() + 2 {
621 let (full_desc, _) = freq.par_frequent(false);
622 let expected_desc: Vec<(&usize, u64)> = full_desc.into_iter().take(n).collect();
623 assert_eq!(
624 freq.top_n(n),
625 expected_desc,
626 "top_n({n}) boundary-tie mismatch"
627 );
628
629 let (full_asc, _) = freq.par_frequent(true);
630 let expected_asc: Vec<(&usize, u64)> = full_asc.into_iter().take(n).collect();
631 assert_eq!(
632 freq.bottom_n(n),
633 expected_asc,
634 "bottom_n({n}) boundary-tie mismatch"
635 );
636 }
637 }
638
639 #[test]
640 fn test_count_methods() {
641 let mut freq = Frequencies::new();
642 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
643
644 assert_eq!(freq.total_count(), 10);
646
647 assert!(freq.has_count(3)); assert!(freq.has_count(4)); assert!(freq.has_count(1)); assert!(!freq.has_count(5)); let items_with_3 = freq.items_with_count(3);
655 assert_eq!(items_with_3, vec![&1]); let items_with_2 = freq.items_with_count(2);
658 assert_eq!(items_with_2, vec![&2]); let items_with_1 = freq.items_with_count(1);
661 assert_eq!(items_with_1, vec![&3]); let items_with_4 = freq.items_with_count(4);
664 assert_eq!(items_with_4, vec![&4]); let items_with_5 = freq.items_with_count(5);
667 assert!(items_with_5.is_empty()); }
669
670 #[test]
671 fn add_borrowed_inserts_new_key() {
672 let mut freq = Frequencies::<Vec<u8>>::new();
673 freq.add_borrowed(b"hello");
674 assert_eq!(freq.count(&b"hello".to_vec()), 1);
675 assert_eq!(freq.cardinality(), 1);
676 }
677
678 #[test]
679 fn add_borrowed_increments_existing_key() {
680 let mut freq = Frequencies::<Vec<u8>>::new();
681 freq.add_borrowed(b"hello");
682 freq.add_borrowed(b"hello");
683 freq.add_borrowed(b"hello");
684 assert_eq!(freq.count(&b"hello".to_vec()), 3);
685 assert_eq!(freq.cardinality(), 1);
686
687 freq.increment_by_borrowed(b"world", 5);
689 assert_eq!(freq.count(&b"world".to_vec()), 5);
690 freq.increment_by_borrowed(b"world", 3);
691 assert_eq!(freq.count(&b"world".to_vec()), 8);
692 }
693
694 #[test]
695 fn borrowed_owned_interop_for_same_key() {
696 let mut freq = Frequencies::<Vec<u8>>::new();
697 freq.add(b"key".to_vec());
699 freq.add_borrowed(b"key");
701 freq.increment_by_borrowed(b"key", 3);
702 assert_eq!(freq.count(&b"key".to_vec()), 5);
704 assert_eq!(freq.cardinality(), 1);
705 }
706
707 #[test]
713 fn modes_antimodes_matches_unsorted() {
714 let mut seed = 0xDEAD_BEEF_u64;
716 let mut next = move |bound: u64| {
717 seed = seed.wrapping_mul(6_364_136_223_846_793_005).wrapping_add(1);
718 (seed >> 33) % bound
719 };
720
721 for case in 0..200 {
722 let n_samples = next(50) as usize;
725 let key_space = 1 + next(30);
726
727 let mut unsorted = crate::Unsorted::<Vec<u8>>::default();
728 let mut freqs = Frequencies::<Vec<u8>>::new();
729 for _ in 0..n_samples {
730 let key = format!("k{:02}", next(key_space)).into_bytes();
731 unsorted.add_bytes(&key);
732 freqs.add_borrowed(&key);
733 }
734
735 assert_eq!(
736 unsorted.cardinality(false, 1),
737 freqs.cardinality(),
738 "cardinality mismatch in case {case}"
739 );
740 assert_eq!(
741 unsorted.modes_antimodes(),
742 freqs.modes_antimodes(),
743 "modes/antimodes mismatch in case {case} (n={n_samples}, k={key_space})"
744 );
745 }
746
747 let mut u = crate::Unsorted::<Vec<u8>>::default();
750 let f = Frequencies::<Vec<u8>>::new();
751 assert_eq!(u.modes_antimodes(), f.modes_antimodes());
752
753 let mut u = crate::Unsorted::<Vec<u8>>::default();
755 let mut f = Frequencies::<Vec<u8>>::new();
756 for _ in 0..5 {
757 u.add_bytes(b"only");
758 f.add_borrowed(b"only");
759 }
760 assert_eq!(u.modes_antimodes(), f.modes_antimodes());
761
762 let mut u = crate::Unsorted::<Vec<u8>>::default();
764 let mut f = Frequencies::<Vec<u8>>::new();
765 for i in 0..15 {
766 let key = format!("u{i:02}").into_bytes();
767 u.add_bytes(&key);
768 f.add_borrowed(&key);
769 }
770 assert_eq!(u.modes_antimodes(), f.modes_antimodes());
771 }
772
773 #[test]
777 fn modes_antimodes_counts_above_u32_max() {
778 let big = u64::from(u32::MAX) + 10;
779
780 let mut f = Frequencies::new();
781 f.increment_by(b"big".to_vec(), big);
782 f.increment_by(b"bigger".to_vec(), big + 5);
783 f.increment_by(b"small".to_vec(), 3);
784
785 let ((modes, modes_count, mode_occ), (antimodes, anti_count, anti_occ)) =
786 f.modes_antimodes();
787 assert_eq!(modes, vec![b"bigger".to_vec()]);
790 assert_eq!(modes_count, 1);
791 assert_eq!(mode_occ, u32::MAX); assert_eq!(antimodes, vec![b"small".to_vec()]);
793 assert_eq!(anti_count, 1);
794 assert_eq!(anti_occ, 3);
795
796 let mut f2 = Frequencies::new();
799 f2.increment_by(b"a".to_vec(), big);
800 f2.increment_by(b"b".to_vec(), big + 1);
801 let ((modes2, _, mode_occ2), (antimodes2, _, anti_occ2)) = f2.modes_antimodes();
802 assert_eq!(modes2, vec![b"b".to_vec()]);
803 assert_eq!(antimodes2, vec![b"a".to_vec()]);
804 assert_eq!(mode_occ2, u32::MAX);
805 assert_eq!(anti_occ2, u32::MAX);
806 }
807}