1use std::collections::hash_map::{Entry, Keys};
2use std::hash::Hash;
3
4use foldhash::{HashMap, HashMapExt};
5
6use rayon::prelude::*;
7
8use crate::Commute;
9
10const PARALLEL_THRESHOLD: usize = 10_000;
11#[derive(Clone)]
13pub struct Frequencies<T> {
14 data: HashMap<T, u64>,
15}
16
17#[cfg(debug_assertions)]
18impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
19 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
20 write!(f, "{:?}", self.data)
21 }
22}
23
24impl<T: Eq + Hash> Frequencies<T> {
25 #[must_use]
27 pub fn new() -> Frequencies<T> {
28 Default::default()
29 }
30
31 #[must_use]
33 pub fn with_capacity(capacity: usize) -> Self {
34 Frequencies {
35 data: HashMap::with_capacity(capacity),
36 }
37 }
38
39 #[inline]
41 pub fn add(&mut self, v: T) {
42 *self.data.entry(v).or_insert(0) += 1;
43 }
44
45 #[inline]
47 #[must_use]
48 pub fn count(&self, v: &T) -> u64 {
49 self.data.get(v).copied().unwrap_or(0)
50 }
51
52 #[inline]
54 #[must_use]
55 pub fn cardinality(&self) -> u64 {
56 self.len() as u64
57 }
58
59 fn collect_counts(&self) -> (Vec<(&T, u64)>, u64) {
61 let mut total_count = 0u64;
62 let counts: Vec<(&T, u64)> = self
63 .data
64 .iter()
65 .map(|(k, &v)| {
66 total_count += v;
67 (k, v)
68 })
69 .collect();
70 (counts, total_count)
71 }
72
73 #[inline]
76 #[must_use]
77 pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
78 let (mut counts, total_count) = self.collect_counts();
79 counts.sort_unstable_by(|&(_, c1), &(_, c2)| c2.cmp(&c1));
80 (counts, total_count)
81 }
82
83 #[inline]
86 #[must_use]
87 pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
88 let (mut counts, total_count) = self.collect_counts();
89 counts.sort_unstable_by(|&(_, c1), &(_, c2)| c1.cmp(&c2));
90 (counts, total_count)
91 }
92
93 #[inline]
96 #[must_use]
97 pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
98 where
99 for<'a> (&'a T, u64): Send,
100 T: Ord,
101 {
102 let (mut counts, total_count) = self.collect_counts();
103 if least {
108 let sort_fn = |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| {
110 c1.cmp(&c2).then_with(|| v1.cmp(v2))
111 };
112 if counts.len() < PARALLEL_THRESHOLD {
113 counts.sort_unstable_by(sort_fn);
114 } else {
115 counts.par_sort_unstable_by(sort_fn);
116 }
117 } else {
118 let sort_fn = |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| {
120 c2.cmp(&c1).then_with(|| v1.cmp(v2))
121 };
122 if counts.len() < PARALLEL_THRESHOLD {
123 counts.sort_unstable_by(sort_fn);
124 } else {
125 counts.par_sort_unstable_by(sort_fn);
126 }
127 }
128 (counts, total_count)
129 }
130
131 #[must_use]
133 pub fn len(&self) -> usize {
134 self.data.len()
135 }
136
137 #[must_use]
139 pub fn is_empty(&self) -> bool {
140 self.data.is_empty()
141 }
142
143 #[must_use]
145 pub fn unique_values(&self) -> UniqueValues<'_, T> {
146 UniqueValues {
147 data_keys: self.data.keys(),
148 }
149 }
150
151 #[must_use]
154 pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
155 where
156 T: Ord,
157 {
158 use std::collections::BinaryHeap;
159
160 let mut heap = BinaryHeap::with_capacity(n + 1);
162
163 for (item, count) in &self.data {
164 heap.push(std::cmp::Reverse((*count, item)));
167
168 if heap.len() > n {
170 heap.pop();
171 }
172 }
173
174 heap.into_sorted_vec()
176 .into_iter()
177 .map(|std::cmp::Reverse((count, item))| (item, count))
178 .collect()
179 }
180
181 #[must_use]
183 pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
184 where
185 T: Ord,
186 {
187 use std::collections::BinaryHeap;
188
189 let mut heap = BinaryHeap::with_capacity(n + 1);
190
191 for (item, count) in &self.data {
192 heap.push((*count, item));
193 if heap.len() > n {
194 heap.pop();
195 }
196 }
197
198 heap.into_sorted_vec()
199 .into_iter()
200 .map(|(count, item)| (item, count))
201 .collect()
202 }
203
204 #[must_use]
206 pub fn items_with_count(&self, n: u64) -> Vec<&T> {
207 self.data
208 .iter()
209 .filter(|&(_, &count)| count == n)
210 .map(|(item, _)| item)
211 .collect()
212 }
213
214 #[must_use]
216 pub fn total_count(&self) -> u64 {
217 self.data.values().sum()
218 }
219
220 #[must_use]
222 pub fn has_count(&self, n: u64) -> bool {
223 self.data.values().any(|&count| count == n)
224 }
225
226 #[inline]
228 pub fn increment_by(&mut self, v: T, count: u64) {
229 match self.data.entry(v) {
230 Entry::Vacant(entry) => {
231 entry.insert(count);
232 }
233 Entry::Occupied(mut entry) => {
234 *entry.get_mut() += count;
235 }
236 }
237 }
238}
239
240impl Frequencies<Vec<u8>> {
241 #[inline]
246 pub fn add_borrowed(&mut self, v: &[u8]) {
247 if let Some(count) = self.data.get_mut(v) {
248 *count += 1;
249 } else {
250 self.data.insert(v.to_vec(), 1);
251 }
252 }
253
254 #[inline]
256 pub fn increment_by_borrowed(&mut self, v: &[u8], count: u64) {
257 if let Some(existing) = self.data.get_mut(v) {
258 *existing += count;
259 } else {
260 self.data.insert(v.to_vec(), count);
261 }
262 }
263}
264
265impl<T: Eq + Hash> Commute for Frequencies<T> {
266 #[inline]
267 fn merge(&mut self, v: Frequencies<T>) {
268 self.data.reserve(v.data.len());
270
271 for (k, v2) in v.data {
272 match self.data.entry(k) {
273 Entry::Vacant(v1) => {
274 v1.insert(v2);
275 }
276 Entry::Occupied(mut v1) => {
277 *v1.get_mut() += v2;
278 }
279 }
280 }
281 }
282}
283
284impl<T: Eq + Hash> Default for Frequencies<T> {
285 #[inline]
286 fn default() -> Frequencies<T> {
287 Frequencies {
288 data: HashMap::with_capacity(64),
289 }
290 }
291}
292
293impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
294 #[inline]
295 fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
296 let mut v = Frequencies::new();
297 v.extend(it);
298 v
299 }
300}
301
302impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
303 #[inline]
304 fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
305 let iter = it.into_iter();
306 if let (lower, Some(upper)) = iter.size_hint()
308 && lower == upper
309 {
310 self.data.reserve(lower.saturating_sub(self.data.len()));
313 }
314 for sample in iter {
315 self.add(sample);
316 }
317 }
318}
319
320pub struct UniqueValues<'a, K> {
322 data_keys: Keys<'a, K, u64>,
323}
324
325impl<'a, K> Iterator for UniqueValues<'a, K> {
326 type Item = &'a K;
327 fn next(&mut self) -> Option<Self::Item> {
328 self.data_keys.next()
329 }
330}
331
332#[cfg(test)]
333mod test {
334 use super::Frequencies;
335 use std::iter::FromIterator;
336
337 #[test]
338 fn ranked() {
339 let mut counts = Frequencies::new();
340 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
341 let (most_count, most_total) = counts.most_frequent();
342 assert_eq!(most_count[0], (&2, 5));
343 assert_eq!(most_total, 11);
344 let (least_count, least_total) = counts.least_frequent();
345 assert_eq!(least_count[0], (&3, 1));
346 assert_eq!(least_total, 11);
347 assert_eq!(
348 counts.least_frequent(),
349 (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
350 );
351 }
352
353 #[test]
354 fn ranked2() {
355 let mut counts = Frequencies::new();
356 counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
357 let (most_count, most_total) = counts.par_frequent(false);
358 assert_eq!(most_count[0], (&2, 5));
359 assert_eq!(most_total, 11);
360 let (least_count, least_total) = counts.par_frequent(true);
361 assert_eq!(least_count[0], (&3, 1));
362 assert_eq!(least_total, 11);
363 }
364
365 #[test]
366 fn unique_values() {
367 let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
368 let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
369 unique.sort_unstable();
370 assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
371 }
372
373 #[test]
374 fn test_top_n() {
375 let mut freq = Frequencies::new();
376 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
377
378 let top_2 = freq.top_n(2);
379 assert_eq!(top_2.len(), 2);
380 assert_eq!(top_2[0], (&4, 4)); assert_eq!(top_2[1], (&1, 3)); let bottom_2 = freq.bottom_n(2);
384 assert_eq!(bottom_2.len(), 2);
385 assert_eq!(bottom_2[0], (&3, 1)); assert_eq!(bottom_2[1], (&2, 2)); }
388
389 #[test]
390 fn test_count_methods() {
391 let mut freq = Frequencies::new();
392 freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
393
394 assert_eq!(freq.total_count(), 10);
396
397 assert!(freq.has_count(3)); assert!(freq.has_count(4)); assert!(freq.has_count(1)); assert!(!freq.has_count(5)); let items_with_3 = freq.items_with_count(3);
405 assert_eq!(items_with_3, vec![&1]); let items_with_2 = freq.items_with_count(2);
408 assert_eq!(items_with_2, vec![&2]); let items_with_1 = freq.items_with_count(1);
411 assert_eq!(items_with_1, vec![&3]); let items_with_4 = freq.items_with_count(4);
414 assert_eq!(items_with_4, vec![&4]); let items_with_5 = freq.items_with_count(5);
417 assert!(items_with_5.is_empty()); }
419
420 #[test]
421 fn add_borrowed_inserts_new_key() {
422 let mut freq = Frequencies::<Vec<u8>>::new();
423 freq.add_borrowed(b"hello");
424 assert_eq!(freq.count(&b"hello".to_vec()), 1);
425 assert_eq!(freq.cardinality(), 1);
426 }
427
428 #[test]
429 fn add_borrowed_increments_existing_key() {
430 let mut freq = Frequencies::<Vec<u8>>::new();
431 freq.add_borrowed(b"hello");
432 freq.add_borrowed(b"hello");
433 freq.add_borrowed(b"hello");
434 assert_eq!(freq.count(&b"hello".to_vec()), 3);
435 assert_eq!(freq.cardinality(), 1);
436
437 freq.increment_by_borrowed(b"world", 5);
439 assert_eq!(freq.count(&b"world".to_vec()), 5);
440 freq.increment_by_borrowed(b"world", 3);
441 assert_eq!(freq.count(&b"world".to_vec()), 8);
442 }
443
444 #[test]
445 fn borrowed_owned_interop_for_same_key() {
446 let mut freq = Frequencies::<Vec<u8>>::new();
447 freq.add(b"key".to_vec());
449 freq.add_borrowed(b"key");
451 freq.increment_by_borrowed(b"key", 3);
452 assert_eq!(freq.count(&b"key".to_vec()), 5);
454 assert_eq!(freq.cardinality(), 1);
455 }
456}