1use std::hash::Hash;
2
3use hashbrown::HashMap;
4use hashbrown::hash_map::{Entry, Keys};
5
6use rayon::prelude::*;
7
8use crate::Commute;
9
10const PARALLEL_THRESHOLD: usize = 10_000;
11#[derive(Clone)]
13pub struct Frequencies<T> {
14 data: HashMap<T, u64>,
15}
16
17#[cfg(debug_assertions)]
18impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
19 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
20 write!(f, "{:?}", self.data)
21 }
22}
23
24impl<T: Eq + Hash> Frequencies<T> {
25 #[must_use]
27 pub fn new() -> Frequencies<T> {
28 Default::default()
29 }
30
31 #[must_use]
33 pub fn with_capacity(capacity: usize) -> Self {
34 Frequencies {
35 data: HashMap::with_capacity(capacity),
36 }
37 }
38
39 #[allow(clippy::inline_always)]
41 #[inline(always)]
42 pub fn add(&mut self, v: T) {
43 *self.data.entry(v).or_insert(0) += 1;
44 }
45
46 #[inline]
48 #[must_use]
49 pub fn count(&self, v: &T) -> u64 {
50 self.data.get(v).copied().unwrap_or(0)
51 }
52
53 #[inline]
55 #[must_use]
56 pub fn cardinality(&self) -> u64 {
57 self.len() as u64
58 }
59
60 fn collect_counts(&self) -> (Vec<(&T, u64)>, u64) {
62 let mut total_count = 0u64;
63 let counts: Vec<(&T, u64)> = self
64 .data
65 .iter()
66 .map(|(k, &v)| {
67 total_count += v;
68 (k, v)
69 })
70 .collect();
71 (counts, total_count)
72 }
73
74 #[inline]
77 #[must_use]
78 pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
79 let (mut counts, total_count) = self.collect_counts();
80 counts.sort_unstable_by_key(|&(_, c)| std::cmp::Reverse(c));
81 (counts, total_count)
82 }
83
84 #[inline]
87 #[must_use]
88 pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
89 let (mut counts, total_count) = self.collect_counts();
90 counts.sort_unstable_by_key(|&(_, c)| c);
91 (counts, total_count)
92 }
93
94 #[inline]
97 #[must_use]
98 pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
99 where
100 for<'a> (&'a T, u64): Send,
101 T: Ord,
102 {
103 let (mut counts, total_count) = self.collect_counts();
104 if least {
109 let sort_fn =
111 |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c1.cmp(&c2).then_with(|| v1.cmp(v2));
112 if counts.len() < PARALLEL_THRESHOLD {
113 counts.sort_unstable_by(sort_fn);
114 } else {
115 counts.par_sort_unstable_by(sort_fn);
116 }
117 } else {
118 let sort_fn =
120 |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c2.cmp(&c1).then_with(|| v1.cmp(v2));
121 if counts.len() < PARALLEL_THRESHOLD {
122 counts.sort_unstable_by(sort_fn);
123 } else {
124 counts.par_sort_unstable_by(sort_fn);
125 }
126 }
127 (counts, total_count)
128 }
129
130 #[must_use]
132 pub fn len(&self) -> usize {
133 self.data.len()
134 }
135
136 #[must_use]
138 pub fn is_empty(&self) -> bool {
139 self.data.is_empty()
140 }
141
142 #[must_use]
144 pub fn unique_values(&self) -> UniqueValues<'_, T> {
145 UniqueValues {
146 data_keys: self.data.keys(),
147 }
148 }
149
150 #[must_use]
153 pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
154 where
155 T: Ord,
156 {
157 use std::collections::BinaryHeap;
158
159 let mut heap = BinaryHeap::with_capacity(n);
164
165 for (item, count) in &self.data {
166 let candidate = (*count, item);
167 if heap.len() < n {
168 heap.push(std::cmp::Reverse(candidate));
169 } else if let Some(top) = heap.peek()
170 && candidate > top.0
171 {
172 heap.pop();
173 heap.push(std::cmp::Reverse(candidate));
174 }
175 }
176
177 heap.into_sorted_vec()
179 .into_iter()
180 .map(|std::cmp::Reverse((count, item))| (item, count))
181 .collect()
182 }
183
184 #[must_use]
186 pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
187 where
188 T: Ord,
189 {
190 use std::collections::BinaryHeap;
191
192 let mut heap = BinaryHeap::with_capacity(n);
194
195 for (item, count) in &self.data {
196 let candidate = (*count, item);
197 if heap.len() < n {
198 heap.push(candidate);
199 } else if let Some(top) = heap.peek()
200 && candidate < *top
201 {
202 heap.pop();
203 heap.push(candidate);
204 }
205 }
206
207 heap.into_sorted_vec()
208 .into_iter()
209 .map(|(count, item)| (item, count))
210 .collect()
211 }
212
213 #[must_use]
215 pub fn items_with_count(&self, n: u64) -> Vec<&T> {
216 self.data
217 .iter()
218 .filter(|&(_, &count)| count == n)
219 .map(|(item, _)| item)
220 .collect()
221 }
222
223 #[must_use]
225 pub fn total_count(&self) -> u64 {
226 self.data.values().sum()
227 }
228
229 #[must_use]
231 pub fn has_count(&self, n: u64) -> bool {
232 self.data.values().any(|&count| count == n)
233 }
234
235 #[allow(clippy::inline_always)]
237 #[inline(always)]
238 pub fn increment_by(&mut self, v: T, count: u64) {
239 match self.data.entry(v) {
240 Entry::Vacant(entry) => {
241 entry.insert(count);
242 }
243 Entry::Occupied(mut entry) => {
244 *entry.get_mut() += count;
245 }
246 }
247 }
248}
249
250impl Frequencies<Vec<u8>> {
251 #[allow(clippy::inline_always)]
258 #[inline(always)]
259 pub fn add_borrowed(&mut self, v: &[u8]) {
260 *self.data.entry_ref(v).or_insert(0) += 1;
261 }
262
263 #[allow(clippy::inline_always)]
265 #[inline(always)]
266 pub fn increment_by_borrowed(&mut self, v: &[u8], count: u64) {
267 *self.data.entry_ref(v).or_insert(0) += count;
268 }
269}
270
271impl<T: Eq + Hash> Commute for Frequencies<T> {
272 #[inline]
273 fn merge(&mut self, v: Frequencies<T>) {
274 self.data.reserve(v.data.len());
276
277 for (k, v2) in v.data {
278 match self.data.entry(k) {
279 Entry::Vacant(v1) => {
280 v1.insert(v2);
281 }
282 Entry::Occupied(mut v1) => {
283 *v1.get_mut() += v2;
284 }
285 }
286 }
287 }
288}
289
290impl<T: Eq + Hash> Default for Frequencies<T> {
291 #[inline]
292 fn default() -> Frequencies<T> {
293 Frequencies {
294 data: HashMap::with_capacity(64),
295 }
296 }
297}
298
299impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
300 #[inline]
301 fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
302 let mut v = Frequencies::new();
303 v.extend(it);
304 v
305 }
306}
307
308impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
309 #[inline]
310 fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
311 let iter = it.into_iter();
312 if let (lower, Some(upper)) = iter.size_hint()
314 && lower == upper
315 {
316 self.data.reserve(lower.saturating_sub(self.data.len()));
319 }
320 for sample in iter {
321 self.add(sample);
322 }
323 }
324}
325
326pub struct UniqueValues<'a, K> {
328 data_keys: Keys<'a, K, u64>,
329}
330
331impl<'a, K> Iterator for UniqueValues<'a, K> {
332 type Item = &'a K;
333 fn next(&mut self) -> Option<Self::Item> {
334 self.data_keys.next()
335 }
336}
337
338#[cfg(test)]
339mod test {
340 use super::Frequencies;
341 use std::iter::FromIterator;
342
343 #[test]
344 fn ranked() {
345 let mut counts = Frequencies::new();
346 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
347 let (most_count, most_total) = counts.most_frequent();
348 assert_eq!(most_count[0], (&2, 5));
349 assert_eq!(most_total, 11);
350 let (least_count, least_total) = counts.least_frequent();
351 assert_eq!(least_count[0], (&3, 1));
352 assert_eq!(least_total, 11);
353 assert_eq!(
354 counts.least_frequent(),
355 (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
356 );
357 }
358
359 #[test]
360 fn ranked2() {
361 let mut counts = Frequencies::new();
362 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
363 let (most_count, most_total) = counts.par_frequent(false);
364 assert_eq!(most_count[0], (&2, 5));
365 assert_eq!(most_total, 11);
366 let (least_count, least_total) = counts.par_frequent(true);
367 assert_eq!(least_count[0], (&3, 1));
368 assert_eq!(least_total, 11);
369 }
370
371 #[test]
372 fn unique_values() {
373 let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
374 let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
375 unique.sort_unstable();
376 assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
377 }
378
379 #[test]
380 fn test_top_n() {
381 let mut freq = Frequencies::new();
382 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
383
384 let top_2 = freq.top_n(2);
385 assert_eq!(top_2.len(), 2);
386 assert_eq!(top_2[0], (&4, 4)); assert_eq!(top_2[1], (&1, 3)); let bottom_2 = freq.bottom_n(2);
390 assert_eq!(bottom_2.len(), 2);
391 assert_eq!(bottom_2[0], (&3, 1)); assert_eq!(bottom_2[1], (&2, 2)); }
394
395 #[test]
396 fn test_count_methods() {
397 let mut freq = Frequencies::new();
398 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
399
400 assert_eq!(freq.total_count(), 10);
402
403 assert!(freq.has_count(3)); assert!(freq.has_count(4)); assert!(freq.has_count(1)); assert!(!freq.has_count(5)); let items_with_3 = freq.items_with_count(3);
411 assert_eq!(items_with_3, vec![&1]); let items_with_2 = freq.items_with_count(2);
414 assert_eq!(items_with_2, vec![&2]); let items_with_1 = freq.items_with_count(1);
417 assert_eq!(items_with_1, vec![&3]); let items_with_4 = freq.items_with_count(4);
420 assert_eq!(items_with_4, vec![&4]); let items_with_5 = freq.items_with_count(5);
423 assert!(items_with_5.is_empty()); }
425
426 #[test]
427 fn add_borrowed_inserts_new_key() {
428 let mut freq = Frequencies::<Vec<u8>>::new();
429 freq.add_borrowed(b"hello");
430 assert_eq!(freq.count(&b"hello".to_vec()), 1);
431 assert_eq!(freq.cardinality(), 1);
432 }
433
434 #[test]
435 fn add_borrowed_increments_existing_key() {
436 let mut freq = Frequencies::<Vec<u8>>::new();
437 freq.add_borrowed(b"hello");
438 freq.add_borrowed(b"hello");
439 freq.add_borrowed(b"hello");
440 assert_eq!(freq.count(&b"hello".to_vec()), 3);
441 assert_eq!(freq.cardinality(), 1);
442
443 freq.increment_by_borrowed(b"world", 5);
445 assert_eq!(freq.count(&b"world".to_vec()), 5);
446 freq.increment_by_borrowed(b"world", 3);
447 assert_eq!(freq.count(&b"world".to_vec()), 8);
448 }
449
450 #[test]
451 fn borrowed_owned_interop_for_same_key() {
452 let mut freq = Frequencies::<Vec<u8>>::new();
453 freq.add(b"key".to_vec());
455 freq.add_borrowed(b"key");
457 freq.increment_by_borrowed(b"key", 3);
458 assert_eq!(freq.count(&b"key".to_vec()), 5);
460 assert_eq!(freq.cardinality(), 1);
461 }
462}