1use std::collections::hash_map::{Entry, Keys};
2use std::hash::Hash;
3
4use foldhash::{HashMap, HashMapExt};
5
6use rayon::prelude::*;
7
8use crate::Commute;
9#[derive(Clone)]
11pub struct Frequencies<T> {
12 data: HashMap<T, u64>,
13}
14
15#[cfg(debug_assertions)]
16impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
17 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
18 write!(f, "{:?}", self.data)
19 }
20}
21
22impl<T: Eq + Hash> Frequencies<T> {
23 #[must_use]
25 pub fn new() -> Frequencies<T> {
26 Default::default()
27 }
28
29 #[must_use]
31 pub fn with_capacity(capacity: usize) -> Self {
32 Frequencies {
33 data: HashMap::with_capacity(capacity),
34 }
35 }
36
37 #[inline]
39 pub fn add(&mut self, v: T) {
40 *self.data.entry(v).or_insert(0) += 1;
41 }
42
43 #[inline]
45 #[must_use]
46 pub fn count(&self, v: &T) -> u64 {
47 self.data.get(v).copied().unwrap_or(0)
48 }
49
50 #[inline]
52 #[must_use]
53 pub fn cardinality(&self) -> u64 {
54 self.len() as u64
55 }
56
57 #[inline]
60 #[must_use]
61 pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
62 let len = self.data.len();
63 let total_count: u64 = self.data.values().sum();
64 let mut counts = Vec::with_capacity(len);
65
66 for (k, &v) in &self.data {
67 counts.push((k, v));
68 }
69 counts.sort_unstable_by(|&(_, c1), &(_, c2)| c2.cmp(&c1));
70 (counts, total_count)
71 }
72
73 #[inline]
76 #[must_use]
77 pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
78 let len = self.data.len();
79 let total_count: u64 = self.data.values().sum();
80 let mut counts = Vec::with_capacity(len);
81
82 for (k, &v) in &self.data {
83 counts.push((k, v));
84 }
85 counts.sort_unstable_by(|&(_, c1), &(_, c2)| c1.cmp(&c2));
86 (counts, total_count)
87 }
88
89 #[inline]
92 #[must_use]
93 pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
94 where
95 for<'a> (&'a T, u64): Send,
96 T: Ord,
97 {
98 let len = self.data.len();
99 let total_count: u64 = self.data.values().sum();
100 let mut counts = Vec::with_capacity(len);
101
102 for (k, &v) in &self.data {
103 counts.push((k, v));
104 }
105 if least {
109 counts.par_sort_unstable_by(|&(v1, c1), &(v2, c2)| {
111 let cmp = c1.cmp(&c2);
112 if cmp == std::cmp::Ordering::Equal {
113 v1.cmp(v2)
114 } else {
115 cmp
116 }
117 });
118 } else {
119 counts
121 .par_sort_unstable_by(|&(v1, c1), &(v2, c2)| c2.cmp(&c1).then_with(|| v1.cmp(v2)));
122 }
123 (counts, total_count)
124 }
125
126 #[must_use]
128 pub fn len(&self) -> usize {
129 self.data.len()
130 }
131
132 #[must_use]
134 pub fn is_empty(&self) -> bool {
135 self.data.is_empty()
136 }
137
138 #[must_use]
140 pub fn unique_values(&self) -> UniqueValues<'_, T> {
141 UniqueValues {
142 data_keys: self.data.keys(),
143 }
144 }
145
146 #[must_use]
149 pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
150 where
151 T: Ord,
152 {
153 use std::collections::BinaryHeap;
154
155 let mut heap = BinaryHeap::with_capacity(n + 1);
157
158 for (item, count) in &self.data {
159 heap.push(std::cmp::Reverse((*count, item)));
162
163 if heap.len() > n {
165 heap.pop();
166 }
167 }
168
169 heap.into_sorted_vec()
171 .into_iter()
172 .map(|std::cmp::Reverse((count, item))| (item, count))
173 .collect()
174 }
175
176 #[must_use]
178 pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
179 where
180 T: Ord,
181 {
182 use std::collections::BinaryHeap;
183
184 let mut heap = BinaryHeap::with_capacity(n + 1);
185
186 for (item, count) in &self.data {
187 heap.push((*count, item));
188 if heap.len() > n {
189 heap.pop();
190 }
191 }
192
193 heap.into_sorted_vec()
194 .into_iter()
195 .map(|(count, item)| (item, count))
196 .collect()
197 }
198
199 #[must_use]
201 pub fn items_with_count(&self, n: u64) -> Vec<&T> {
202 self.data
203 .iter()
204 .filter(|&(_, &count)| count == n)
205 .map(|(item, _)| item)
206 .collect()
207 }
208
209 #[must_use]
211 pub fn total_count(&self) -> u64 {
212 self.data.values().sum()
213 }
214
215 #[must_use]
217 pub fn has_count(&self, n: u64) -> bool {
218 self.data.values().any(|&count| count == n)
219 }
220
221 #[inline]
223 pub fn increment_by(&mut self, v: T, count: u64) {
224 match self.data.entry(v) {
225 Entry::Vacant(entry) => {
226 entry.insert(count);
227 }
228 Entry::Occupied(mut entry) => {
229 *entry.get_mut() += count;
230 }
231 }
232 }
233}
234
235impl Frequencies<Vec<u8>> {
236 #[inline]
241 pub fn add_borrowed(&mut self, v: &[u8]) {
242 if let Some(count) = self.data.get_mut(v) {
243 *count += 1;
244 } else {
245 self.data.insert(v.to_vec(), 1);
246 }
247 }
248
249 #[inline]
251 pub fn increment_by_borrowed(&mut self, v: &[u8], count: u64) {
252 if let Some(existing) = self.data.get_mut(v) {
253 *existing += count;
254 } else {
255 self.data.insert(v.to_vec(), count);
256 }
257 }
258}
259
260impl<T: Eq + Hash> Commute for Frequencies<T> {
261 #[inline]
262 fn merge(&mut self, v: Frequencies<T>) {
263 self.data.reserve(v.data.len());
265
266 for (k, v2) in v.data {
267 match self.data.entry(k) {
268 Entry::Vacant(v1) => {
269 v1.insert(v2);
270 }
271 Entry::Occupied(mut v1) => {
272 *v1.get_mut() += v2;
273 }
274 }
275 }
276 }
277}
278
279impl<T: Eq + Hash> Default for Frequencies<T> {
280 #[inline]
281 fn default() -> Frequencies<T> {
282 Frequencies {
283 data: HashMap::with_capacity(1_000),
284 }
285 }
286}
287
288impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
289 #[inline]
290 fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
291 let mut v = Frequencies::new();
292 v.extend(it);
293 v
294 }
295}
296
297impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
298 #[inline]
299 fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
300 let iter = it.into_iter();
301 if let (lower, Some(upper)) = iter.size_hint()
303 && lower == upper
304 {
305 self.data.reserve(lower.saturating_sub(self.data.len()));
308 }
309 for sample in iter {
310 self.add(sample);
311 }
312 }
313}
314
315pub struct UniqueValues<'a, K> {
317 data_keys: Keys<'a, K, u64>,
318}
319
320impl<'a, K> Iterator for UniqueValues<'a, K> {
321 type Item = &'a K;
322 fn next(&mut self) -> Option<Self::Item> {
323 self.data_keys.next()
324 }
325}
326
327#[cfg(test)]
328mod test {
329 use super::Frequencies;
330 use std::iter::FromIterator;
331
332 #[test]
333 fn ranked() {
334 let mut counts = Frequencies::new();
335 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
336 let (most_count, most_total) = counts.most_frequent();
337 assert_eq!(most_count[0], (&2, 5));
338 assert_eq!(most_total, 11);
339 let (least_count, least_total) = counts.least_frequent();
340 assert_eq!(least_count[0], (&3, 1));
341 assert_eq!(least_total, 11);
342 assert_eq!(
343 counts.least_frequent(),
344 (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
345 );
346 }
347
348 #[test]
349 fn ranked2() {
350 let mut counts = Frequencies::new();
351 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
352 let (most_count, most_total) = counts.par_frequent(false);
353 assert_eq!(most_count[0], (&2, 5));
354 assert_eq!(most_total, 11);
355 let (least_count, least_total) = counts.par_frequent(true);
356 assert_eq!(least_count[0], (&3, 1));
357 assert_eq!(least_total, 11);
358 }
359
360 #[test]
361 fn unique_values() {
362 let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
363 let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
364 unique.sort_unstable();
365 assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
366 }
367
368 #[test]
369 fn test_top_n() {
370 let mut freq = Frequencies::new();
371 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
372
373 let top_2 = freq.top_n(2);
374 assert_eq!(top_2.len(), 2);
375 assert_eq!(top_2[0], (&4, 4)); assert_eq!(top_2[1], (&1, 3)); let bottom_2 = freq.bottom_n(2);
379 assert_eq!(bottom_2.len(), 2);
380 assert_eq!(bottom_2[0], (&3, 1)); assert_eq!(bottom_2[1], (&2, 2)); }
383
384 #[test]
385 fn test_count_methods() {
386 let mut freq = Frequencies::new();
387 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
388
389 assert_eq!(freq.total_count(), 10);
391
392 assert!(freq.has_count(3)); assert!(freq.has_count(4)); assert!(freq.has_count(1)); assert!(!freq.has_count(5)); let items_with_3 = freq.items_with_count(3);
400 assert_eq!(items_with_3, vec![&1]); let items_with_2 = freq.items_with_count(2);
403 assert_eq!(items_with_2, vec![&2]); let items_with_1 = freq.items_with_count(1);
406 assert_eq!(items_with_1, vec![&3]); let items_with_4 = freq.items_with_count(4);
409 assert_eq!(items_with_4, vec![&4]); let items_with_5 = freq.items_with_count(5);
412 assert!(items_with_5.is_empty()); }
414
415 #[test]
416 fn add_borrowed_inserts_new_key() {
417 let mut freq = Frequencies::<Vec<u8>>::new();
418 freq.add_borrowed(b"hello");
419 assert_eq!(freq.count(&b"hello".to_vec()), 1);
420 assert_eq!(freq.cardinality(), 1);
421 }
422
423 #[test]
424 fn add_borrowed_increments_existing_key() {
425 let mut freq = Frequencies::<Vec<u8>>::new();
426 freq.add_borrowed(b"hello");
427 freq.add_borrowed(b"hello");
428 freq.add_borrowed(b"hello");
429 assert_eq!(freq.count(&b"hello".to_vec()), 3);
430 assert_eq!(freq.cardinality(), 1);
431
432 freq.increment_by_borrowed(b"world", 5);
434 assert_eq!(freq.count(&b"world".to_vec()), 5);
435 freq.increment_by_borrowed(b"world", 3);
436 assert_eq!(freq.count(&b"world".to_vec()), 8);
437 }
438
439 #[test]
440 fn borrowed_owned_interop_for_same_key() {
441 let mut freq = Frequencies::<Vec<u8>>::new();
442 freq.add(b"key".to_vec());
444 freq.add_borrowed(b"key");
446 freq.increment_by_borrowed(b"key", 3);
447 assert_eq!(freq.count(&b"key".to_vec()), 5);
449 assert_eq!(freq.cardinality(), 1);
450 }
451}