Skip to main content

stats/
frequency.rs

1use std::hash::Hash;
2
3use hashbrown::HashMap;
4use hashbrown::hash_map::{Entry, Keys};
5
6use rayon::prelude::*;
7
8use crate::Commute;
9
10const PARALLEL_THRESHOLD: usize = 10_000;
11/// A commutative data structure for exact frequency counts.
12#[derive(Clone)]
13pub struct Frequencies<T> {
14    data: HashMap<T, u64>,
15}
16
17#[cfg(debug_assertions)]
18impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
19    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
20        write!(f, "{:?}", self.data)
21    }
22}
23
24impl<T: Eq + Hash> Frequencies<T> {
25    /// Create a new frequency table with no samples.
26    #[must_use]
27    pub fn new() -> Frequencies<T> {
28        Default::default()
29    }
30
31    // Add constructor with configurable capacity
32    #[must_use]
33    pub fn with_capacity(capacity: usize) -> Self {
34        Frequencies {
35            data: HashMap::with_capacity(capacity),
36        }
37    }
38
39    /// Add a value to the frequency table.
40    #[allow(clippy::inline_always)]
41    #[inline(always)]
42    pub fn add(&mut self, v: T) {
43        *self.data.entry(v).or_insert(0) += 1;
44    }
45
46    /// Return the number of occurrences of `v` in the data.
47    #[inline]
48    #[must_use]
49    pub fn count(&self, v: &T) -> u64 {
50        self.data.get(v).copied().unwrap_or(0)
51    }
52
53    /// Return the cardinality (number of unique elements) in the data.
54    #[inline]
55    #[must_use]
56    pub fn cardinality(&self) -> u64 {
57        self.len() as u64
58    }
59
60    /// Collect counts and total in a single pass, reused by `most/least_frequent`.
61    fn collect_counts(&self) -> (Vec<(&T, u64)>, u64) {
62        let mut total_count = 0u64;
63        let counts: Vec<(&T, u64)> = self
64            .data
65            .iter()
66            .map(|(k, &v)| {
67                total_count += v;
68                (k, v)
69            })
70            .collect();
71        (counts, total_count)
72    }
73
74    /// Return a `Vec` of elements, their corresponding counts in
75    /// descending order, and the total count.
76    #[inline]
77    #[must_use]
78    pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
79        let (mut counts, total_count) = self.collect_counts();
80        counts.sort_unstable_by_key(|&(_, c)| std::cmp::Reverse(c));
81        (counts, total_count)
82    }
83
84    /// Return a `Vec` of elements, their corresponding counts in
85    /// ascending order, and the total count.
86    #[inline]
87    #[must_use]
88    pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
89        let (mut counts, total_count) = self.collect_counts();
90        counts.sort_unstable_by_key(|&(_, c)| c);
91        (counts, total_count)
92    }
93
94    /// Return a `Vec` of elements, their corresponding counts in order
95    /// based on the `least` parameter, and the total count. Uses parallel sort.
96    #[inline]
97    #[must_use]
98    pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
99    where
100        for<'a> (&'a T, u64): Send,
101        T: Ord,
102    {
103        let (mut counts, total_count) = self.collect_counts();
104        // sort by counts asc/desc
105        // if counts are equal, sort by values lexicographically
106        // We need to do this because otherwise the values are not guaranteed to be in order for
107        // equal counts
108        if least {
109            // return counts in ascending order
110            let sort_fn =
111                |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c1.cmp(&c2).then_with(|| v1.cmp(v2));
112            if counts.len() < PARALLEL_THRESHOLD {
113                counts.sort_unstable_by(sort_fn);
114            } else {
115                counts.par_sort_unstable_by(sort_fn);
116            }
117        } else {
118            // return counts in descending order
119            let sort_fn =
120                |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c2.cmp(&c1).then_with(|| v1.cmp(v2));
121            if counts.len() < PARALLEL_THRESHOLD {
122                counts.sort_unstable_by(sort_fn);
123            } else {
124                counts.par_sort_unstable_by(sort_fn);
125            }
126        }
127        (counts, total_count)
128    }
129
130    /// Returns the cardinality of the data.
131    #[must_use]
132    pub fn len(&self) -> usize {
133        self.data.len()
134    }
135
136    /// Returns true if there is no frequency/cardinality data.
137    #[must_use]
138    pub fn is_empty(&self) -> bool {
139        self.data.is_empty()
140    }
141
142    /// Return an iterator over the unique values of the data.
143    #[must_use]
144    pub fn unique_values(&self) -> UniqueValues<'_, T> {
145        UniqueValues {
146            data_keys: self.data.keys(),
147        }
148    }
149
150    /// Get the top N most frequent items without sorting the entire vector
151    /// This is much faster than `most_frequent()` when you only need a few items
152    #[must_use]
153    pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
154    where
155        T: Ord,
156    {
157        use std::collections::BinaryHeap;
158
159        // Ties are broken by value *ascending* so the result matches
160        // `most_frequent()` / `par_frequent(false)` (count desc, value asc).
161        // We achieve this by ranking each entry on `(count, Reverse(item))`:
162        // a larger count wins, and on a count tie the lexicographically smaller
163        // value wins (its `Reverse(item)` is larger).
164        //
165        // Min-heap (via the outer Reverse) of the n best entries seen so far.
166        // peek() returns the worst entry currently in the top-N — replace it
167        // only when a strictly better candidate comes in. Avoids a push+pop per
168        // rejected element on high-cardinality inputs with small n.
169        let mut heap = BinaryHeap::with_capacity(n);
170
171        for (item, count) in &self.data {
172            let candidate = (*count, std::cmp::Reverse(item));
173            if heap.len() < n {
174                heap.push(std::cmp::Reverse(candidate));
175            } else if let Some(top) = heap.peek()
176                && candidate > top.0
177            {
178                heap.pop();
179                heap.push(std::cmp::Reverse(candidate));
180            }
181        }
182
183        // Convert to sorted vector (count desc, value asc)
184        heap.into_sorted_vec()
185            .into_iter()
186            .map(|std::cmp::Reverse((count, std::cmp::Reverse(item)))| (item, count))
187            .collect()
188    }
189
190    /// Similar to `top_n` but for least frequent items
191    #[must_use]
192    pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
193    where
194        T: Ord,
195    {
196        use std::collections::BinaryHeap;
197
198        // Max-heap of the n smallest elements seen so far. Mirror of top_n.
199        let mut heap = BinaryHeap::with_capacity(n);
200
201        for (item, count) in &self.data {
202            let candidate = (*count, item);
203            if heap.len() < n {
204                heap.push(candidate);
205            } else if let Some(top) = heap.peek()
206                && candidate < *top
207            {
208                heap.pop();
209                heap.push(candidate);
210            }
211        }
212
213        heap.into_sorted_vec()
214            .into_iter()
215            .map(|(count, item)| (item, count))
216            .collect()
217    }
218
219    /// Get items with exactly n occurrences
220    #[must_use]
221    pub fn items_with_count(&self, n: u64) -> Vec<&T> {
222        self.data
223            .iter()
224            .filter(|&(_, &count)| count == n)
225            .map(|(item, _)| item)
226            .collect()
227    }
228
229    /// Get the sum of all counts
230    #[must_use]
231    pub fn total_count(&self) -> u64 {
232        self.data.values().sum()
233    }
234
235    /// Check if any item occurs exactly n times
236    #[must_use]
237    pub fn has_count(&self, n: u64) -> bool {
238        self.data.values().any(|&count| count == n)
239    }
240
241    /// Add specialized method for single increment
242    #[allow(clippy::inline_always)]
243    #[inline(always)]
244    pub fn increment_by(&mut self, v: T, count: u64) {
245        match self.data.entry(v) {
246            Entry::Vacant(entry) => {
247                entry.insert(count);
248            }
249            Entry::Occupied(mut entry) => {
250                *entry.get_mut() += count;
251            }
252        }
253    }
254}
255
256impl Frequencies<Vec<u8>> {
257    /// Increment count for a byte slice key, avoiding allocation when key exists.
258    /// Uses hashbrown's `entry_ref(&[u8])`, which probes once with the borrowed
259    /// key and only allocates (`[u8]::to_owned()` -> `Vec<u8>`) on the vacant
260    /// branch. For low-cardinality columns (the common case), this eliminates
261    /// ~99% of allocations; for new keys it is a single hash+probe (std's
262    /// HashMap has no stable raw-entry API, so the old path hashed twice).
263    #[allow(clippy::inline_always)]
264    #[inline(always)]
265    pub fn add_borrowed(&mut self, v: &[u8]) {
266        *self.data.entry_ref(v).or_insert(0) += 1;
267    }
268
269    /// Increment by a count for a byte slice key, avoiding allocation when key exists.
270    #[allow(clippy::inline_always)]
271    #[inline(always)]
272    pub fn increment_by_borrowed(&mut self, v: &[u8], count: u64) {
273        *self.data.entry_ref(v).or_insert(0) += count;
274    }
275}
276
277impl<T: Eq + Hash> Commute for Frequencies<T> {
278    #[inline]
279    fn merge(&mut self, v: Frequencies<T>) {
280        // Reserve additional capacity to avoid reallocations
281        self.data.reserve(v.data.len());
282
283        for (k, v2) in v.data {
284            match self.data.entry(k) {
285                Entry::Vacant(v1) => {
286                    v1.insert(v2);
287                }
288                Entry::Occupied(mut v1) => {
289                    *v1.get_mut() += v2;
290                }
291            }
292        }
293    }
294}
295
296impl<T: Eq + Hash> Default for Frequencies<T> {
297    #[inline]
298    fn default() -> Frequencies<T> {
299        Frequencies {
300            data: HashMap::with_capacity(64),
301        }
302    }
303}
304
305impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
306    #[inline]
307    fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
308        let mut v = Frequencies::new();
309        v.extend(it);
310        v
311    }
312}
313
314impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
315    #[inline]
316    fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
317        let iter = it.into_iter();
318        // Reserve capacity if size hint is available and reliable
319        if let (lower, Some(upper)) = iter.size_hint()
320            && lower == upper
321        {
322            // Exact size known - reserve capacity for new entries
323            // We don't know how many will be new vs existing, so reserve conservatively
324            self.data.reserve(lower.saturating_sub(self.data.len()));
325        }
326        for sample in iter {
327            self.add(sample);
328        }
329    }
330}
331
332/// An iterator over unique values in a frequencies count.
333pub struct UniqueValues<'a, K> {
334    data_keys: Keys<'a, K, u64>,
335}
336
337impl<'a, K> Iterator for UniqueValues<'a, K> {
338    type Item = &'a K;
339    #[inline]
340    fn next(&mut self) -> Option<Self::Item> {
341        self.data_keys.next()
342    }
343
344    // Forward the exact size hint from the underlying `Keys` iterator so that
345    // `collect`/`extend` can preallocate exactly instead of reallocating.
346    #[inline]
347    fn size_hint(&self) -> (usize, Option<usize>) {
348        self.data_keys.size_hint()
349    }
350}
351
352impl<K> ExactSizeIterator for UniqueValues<'_, K> {
353    #[inline]
354    fn len(&self) -> usize {
355        self.data_keys.len()
356    }
357}
358
359impl<K> std::iter::FusedIterator for UniqueValues<'_, K> {}
360
361#[cfg(test)]
362mod test {
363    use super::Frequencies;
364    use std::iter::FromIterator;
365
366    #[test]
367    fn ranked() {
368        let mut counts = Frequencies::new();
369        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
370        let (most_count, most_total) = counts.most_frequent();
371        assert_eq!(most_count[0], (&2, 5));
372        assert_eq!(most_total, 11);
373        let (least_count, least_total) = counts.least_frequent();
374        assert_eq!(least_count[0], (&3, 1));
375        assert_eq!(least_total, 11);
376        assert_eq!(
377            counts.least_frequent(),
378            (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
379        );
380    }
381
382    #[test]
383    fn ranked2() {
384        let mut counts = Frequencies::new();
385        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
386        let (most_count, most_total) = counts.par_frequent(false);
387        assert_eq!(most_count[0], (&2, 5));
388        assert_eq!(most_total, 11);
389        let (least_count, least_total) = counts.par_frequent(true);
390        assert_eq!(least_count[0], (&3, 1));
391        assert_eq!(least_total, 11);
392    }
393
394    #[test]
395    fn unique_values() {
396        let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
397        let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
398        unique.sort_unstable();
399        assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
400    }
401
402    #[test]
403    fn test_top_n() {
404        let mut freq = Frequencies::new();
405        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
406
407        let top_2 = freq.top_n(2);
408        assert_eq!(top_2.len(), 2);
409        assert_eq!(top_2[0], (&4, 4)); // Most frequent
410        assert_eq!(top_2[1], (&1, 3)); // Second most frequent
411
412        let bottom_2 = freq.bottom_n(2);
413        assert_eq!(bottom_2.len(), 2);
414        assert_eq!(bottom_2[0], (&3, 1)); // Least frequent
415        assert_eq!(bottom_2[1], (&2, 2)); // Second least frequent
416    }
417
418    // top_n/bottom_n must select the SAME set and order as
419    // par_frequent(..) truncated to n, including the tie-break at the
420    // n/n+1 boundary (count primary, value ascending on ties). This is the
421    // invariant qsv's `frequency --limit N` fast path relies on.
422    #[test]
423    fn top_n_matches_par_frequent_all_ties() {
424        // All counts equal: every comparison is a pure value tie-break.
425        let mut freq = Frequencies::new();
426        freq.extend(vec![1usize, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
427        for n in 0..=10usize {
428            let (full_desc, _) = freq.par_frequent(false);
429            let expected_desc: Vec<(&usize, u64)> = full_desc.into_iter().take(n).collect();
430            assert_eq!(freq.top_n(n), expected_desc, "top_n({n}) all-ties mismatch");
431
432            let (full_asc, _) = freq.par_frequent(true);
433            let expected_asc: Vec<(&usize, u64)> = full_asc.into_iter().take(n).collect();
434            assert_eq!(
435                freq.bottom_n(n),
436                expected_asc,
437                "bottom_n({n}) all-ties mismatch"
438            );
439        }
440    }
441
442    #[test]
443    fn top_n_matches_par_frequent_boundary_ties() {
444        // Counts deliberately tie across the cutoff: values 10..19 all count 2,
445        // values 20..24 count 5, values 30..34 count 1.
446        let mut freq = Frequencies::new();
447        for v in 10..20usize {
448            freq.extend(vec![v, v]); // count 2
449        }
450        for v in 20..25usize {
451            freq.extend(vec![v; 5]); // count 5
452        }
453        for v in 30..35usize {
454            freq.extend(vec![v]); // count 1
455        }
456        for n in 0..=freq.len() + 2 {
457            let (full_desc, _) = freq.par_frequent(false);
458            let expected_desc: Vec<(&usize, u64)> = full_desc.into_iter().take(n).collect();
459            assert_eq!(
460                freq.top_n(n),
461                expected_desc,
462                "top_n({n}) boundary-tie mismatch"
463            );
464
465            let (full_asc, _) = freq.par_frequent(true);
466            let expected_asc: Vec<(&usize, u64)> = full_asc.into_iter().take(n).collect();
467            assert_eq!(
468                freq.bottom_n(n),
469                expected_asc,
470                "bottom_n({n}) boundary-tie mismatch"
471            );
472        }
473    }
474
475    #[test]
476    fn test_count_methods() {
477        let mut freq = Frequencies::new();
478        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
479
480        // Test total_count()
481        assert_eq!(freq.total_count(), 10);
482
483        // Test has_count()
484        assert!(freq.has_count(3)); // 1 appears 3 times
485        assert!(freq.has_count(4)); // 4 appears 4 times
486        assert!(freq.has_count(1)); // 3 appears 1 time
487        assert!(!freq.has_count(5)); // No element appears 5 times
488
489        // Test items_with_count()
490        let items_with_3 = freq.items_with_count(3);
491        assert_eq!(items_with_3, vec![&1]); // Only 1 appears 3 times
492
493        let items_with_2 = freq.items_with_count(2);
494        assert_eq!(items_with_2, vec![&2]); // Only 2 appears 2 times
495
496        let items_with_1 = freq.items_with_count(1);
497        assert_eq!(items_with_1, vec![&3]); // Only 3 appears 1 time
498
499        let items_with_4 = freq.items_with_count(4);
500        assert_eq!(items_with_4, vec![&4]); // Only 4 appears 4 times
501
502        let items_with_5 = freq.items_with_count(5);
503        assert!(items_with_5.is_empty()); // No elements appear 5 times
504    }
505
506    #[test]
507    fn add_borrowed_inserts_new_key() {
508        let mut freq = Frequencies::<Vec<u8>>::new();
509        freq.add_borrowed(b"hello");
510        assert_eq!(freq.count(&b"hello".to_vec()), 1);
511        assert_eq!(freq.cardinality(), 1);
512    }
513
514    #[test]
515    fn add_borrowed_increments_existing_key() {
516        let mut freq = Frequencies::<Vec<u8>>::new();
517        freq.add_borrowed(b"hello");
518        freq.add_borrowed(b"hello");
519        freq.add_borrowed(b"hello");
520        assert_eq!(freq.count(&b"hello".to_vec()), 3);
521        assert_eq!(freq.cardinality(), 1);
522
523        // Also test increment_by_borrowed
524        freq.increment_by_borrowed(b"world", 5);
525        assert_eq!(freq.count(&b"world".to_vec()), 5);
526        freq.increment_by_borrowed(b"world", 3);
527        assert_eq!(freq.count(&b"world".to_vec()), 8);
528    }
529
530    #[test]
531    fn borrowed_owned_interop_for_same_key() {
532        let mut freq = Frequencies::<Vec<u8>>::new();
533        // Insert via owned add
534        freq.add(b"key".to_vec());
535        // Increment via borrowed add
536        freq.add_borrowed(b"key");
537        freq.increment_by_borrowed(b"key", 3);
538        // All methods should see the same accumulated count
539        assert_eq!(freq.count(&b"key".to_vec()), 5);
540        assert_eq!(freq.cardinality(), 1);
541    }
542}