Skip to main content

stats/
frequency.rs

1use std::hash::Hash;
2
3use hashbrown::HashMap;
4use hashbrown::hash_map::{Entry, Keys};
5
6use rayon::prelude::*;
7
8use crate::Commute;
9
10const PARALLEL_THRESHOLD: usize = 10_000;
11/// A commutative data structure for exact frequency counts.
12#[derive(Clone)]
13pub struct Frequencies<T> {
14    data: HashMap<T, u64>,
15}
16
17#[cfg(debug_assertions)]
18impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
19    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
20        write!(f, "{:?}", self.data)
21    }
22}
23
24impl<T: Eq + Hash> Frequencies<T> {
25    /// Create a new frequency table with no samples.
26    #[must_use]
27    pub fn new() -> Frequencies<T> {
28        Default::default()
29    }
30
31    // Add constructor with configurable capacity
32    #[must_use]
33    pub fn with_capacity(capacity: usize) -> Self {
34        Frequencies {
35            data: HashMap::with_capacity(capacity),
36        }
37    }
38
39    /// Add a value to the frequency table.
40    #[allow(clippy::inline_always)]
41    #[inline(always)]
42    pub fn add(&mut self, v: T) {
43        *self.data.entry(v).or_insert(0) += 1;
44    }
45
46    /// Return the number of occurrences of `v` in the data.
47    #[inline]
48    #[must_use]
49    pub fn count(&self, v: &T) -> u64 {
50        self.data.get(v).copied().unwrap_or(0)
51    }
52
53    /// Return the cardinality (number of unique elements) in the data.
54    #[inline]
55    #[must_use]
56    pub fn cardinality(&self) -> u64 {
57        self.len() as u64
58    }
59
60    /// Collect counts and total in a single pass, reused by `most/least_frequent`.
61    fn collect_counts(&self) -> (Vec<(&T, u64)>, u64) {
62        let mut total_count = 0u64;
63        let counts: Vec<(&T, u64)> = self
64            .data
65            .iter()
66            .map(|(k, &v)| {
67                total_count += v;
68                (k, v)
69            })
70            .collect();
71        (counts, total_count)
72    }
73
74    /// Return a `Vec` of elements, their corresponding counts in
75    /// descending order, and the total count.
76    #[inline]
77    #[must_use]
78    pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
79        let (mut counts, total_count) = self.collect_counts();
80        counts.sort_unstable_by_key(|&(_, c)| std::cmp::Reverse(c));
81        (counts, total_count)
82    }
83
84    /// Return a `Vec` of elements, their corresponding counts in
85    /// ascending order, and the total count.
86    #[inline]
87    #[must_use]
88    pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
89        let (mut counts, total_count) = self.collect_counts();
90        counts.sort_unstable_by_key(|&(_, c)| c);
91        (counts, total_count)
92    }
93
94    /// Return a `Vec` of elements, their corresponding counts in order
95    /// based on the `least` parameter, and the total count. Uses parallel sort.
96    #[inline]
97    #[must_use]
98    pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
99    where
100        for<'a> (&'a T, u64): Send,
101        T: Ord,
102    {
103        let (mut counts, total_count) = self.collect_counts();
104        // sort by counts asc/desc
105        // if counts are equal, sort by values lexicographically
106        // We need to do this because otherwise the values are not guaranteed to be in order for
107        // equal counts
108        if least {
109            // return counts in ascending order
110            let sort_fn =
111                |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c1.cmp(&c2).then_with(|| v1.cmp(v2));
112            if counts.len() < PARALLEL_THRESHOLD {
113                counts.sort_unstable_by(sort_fn);
114            } else {
115                counts.par_sort_unstable_by(sort_fn);
116            }
117        } else {
118            // return counts in descending order
119            let sort_fn =
120                |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c2.cmp(&c1).then_with(|| v1.cmp(v2));
121            if counts.len() < PARALLEL_THRESHOLD {
122                counts.sort_unstable_by(sort_fn);
123            } else {
124                counts.par_sort_unstable_by(sort_fn);
125            }
126        }
127        (counts, total_count)
128    }
129
130    /// Returns the cardinality of the data.
131    #[must_use]
132    pub fn len(&self) -> usize {
133        self.data.len()
134    }
135
136    /// Returns true if there is no frequency/cardinality data.
137    #[must_use]
138    pub fn is_empty(&self) -> bool {
139        self.data.is_empty()
140    }
141
142    /// Return an iterator over the unique values of the data.
143    #[must_use]
144    pub fn unique_values(&self) -> UniqueValues<'_, T> {
145        UniqueValues {
146            data_keys: self.data.keys(),
147        }
148    }
149
150    /// Get the top N most frequent items without sorting the entire vector
151    /// This is much faster than `most_frequent()` when you only need a few items
152    #[must_use]
153    pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
154    where
155        T: Ord,
156    {
157        use std::collections::BinaryHeap;
158
159        // Min-heap (via Reverse) of the n largest elements seen so far.
160        // peek() returns the smallest entry currently in the top-N — replace it
161        // only when a strictly larger count comes in. Avoids a push+pop per
162        // rejected element on high-cardinality inputs with small n.
163        let mut heap = BinaryHeap::with_capacity(n);
164
165        for (item, count) in &self.data {
166            let candidate = (*count, item);
167            if heap.len() < n {
168                heap.push(std::cmp::Reverse(candidate));
169            } else if let Some(top) = heap.peek()
170                && candidate > top.0
171            {
172                heap.pop();
173                heap.push(std::cmp::Reverse(candidate));
174            }
175        }
176
177        // Convert to sorted vector
178        heap.into_sorted_vec()
179            .into_iter()
180            .map(|std::cmp::Reverse((count, item))| (item, count))
181            .collect()
182    }
183
184    /// Similar to `top_n` but for least frequent items
185    #[must_use]
186    pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
187    where
188        T: Ord,
189    {
190        use std::collections::BinaryHeap;
191
192        // Max-heap of the n smallest elements seen so far. Mirror of top_n.
193        let mut heap = BinaryHeap::with_capacity(n);
194
195        for (item, count) in &self.data {
196            let candidate = (*count, item);
197            if heap.len() < n {
198                heap.push(candidate);
199            } else if let Some(top) = heap.peek()
200                && candidate < *top
201            {
202                heap.pop();
203                heap.push(candidate);
204            }
205        }
206
207        heap.into_sorted_vec()
208            .into_iter()
209            .map(|(count, item)| (item, count))
210            .collect()
211    }
212
213    /// Get items with exactly n occurrences
214    #[must_use]
215    pub fn items_with_count(&self, n: u64) -> Vec<&T> {
216        self.data
217            .iter()
218            .filter(|&(_, &count)| count == n)
219            .map(|(item, _)| item)
220            .collect()
221    }
222
223    /// Get the sum of all counts
224    #[must_use]
225    pub fn total_count(&self) -> u64 {
226        self.data.values().sum()
227    }
228
229    /// Check if any item occurs exactly n times
230    #[must_use]
231    pub fn has_count(&self, n: u64) -> bool {
232        self.data.values().any(|&count| count == n)
233    }
234
235    /// Add specialized method for single increment
236    #[allow(clippy::inline_always)]
237    #[inline(always)]
238    pub fn increment_by(&mut self, v: T, count: u64) {
239        match self.data.entry(v) {
240            Entry::Vacant(entry) => {
241                entry.insert(count);
242            }
243            Entry::Occupied(mut entry) => {
244                *entry.get_mut() += count;
245            }
246        }
247    }
248}
249
250impl Frequencies<Vec<u8>> {
251    /// Increment count for a byte slice key, avoiding allocation when key exists.
252    /// Uses hashbrown's `entry_ref(&[u8])`, which probes once with the borrowed
253    /// key and only allocates (`[u8]::to_owned()` -> `Vec<u8>`) on the vacant
254    /// branch. For low-cardinality columns (the common case), this eliminates
255    /// ~99% of allocations; for new keys it is a single hash+probe (std's
256    /// HashMap has no stable raw-entry API, so the old path hashed twice).
257    #[allow(clippy::inline_always)]
258    #[inline(always)]
259    pub fn add_borrowed(&mut self, v: &[u8]) {
260        *self.data.entry_ref(v).or_insert(0) += 1;
261    }
262
263    /// Increment by a count for a byte slice key, avoiding allocation when key exists.
264    #[allow(clippy::inline_always)]
265    #[inline(always)]
266    pub fn increment_by_borrowed(&mut self, v: &[u8], count: u64) {
267        *self.data.entry_ref(v).or_insert(0) += count;
268    }
269}
270
271impl<T: Eq + Hash> Commute for Frequencies<T> {
272    #[inline]
273    fn merge(&mut self, v: Frequencies<T>) {
274        // Reserve additional capacity to avoid reallocations
275        self.data.reserve(v.data.len());
276
277        for (k, v2) in v.data {
278            match self.data.entry(k) {
279                Entry::Vacant(v1) => {
280                    v1.insert(v2);
281                }
282                Entry::Occupied(mut v1) => {
283                    *v1.get_mut() += v2;
284                }
285            }
286        }
287    }
288}
289
290impl<T: Eq + Hash> Default for Frequencies<T> {
291    #[inline]
292    fn default() -> Frequencies<T> {
293        Frequencies {
294            data: HashMap::with_capacity(64),
295        }
296    }
297}
298
299impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
300    #[inline]
301    fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
302        let mut v = Frequencies::new();
303        v.extend(it);
304        v
305    }
306}
307
308impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
309    #[inline]
310    fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
311        let iter = it.into_iter();
312        // Reserve capacity if size hint is available and reliable
313        if let (lower, Some(upper)) = iter.size_hint()
314            && lower == upper
315        {
316            // Exact size known - reserve capacity for new entries
317            // We don't know how many will be new vs existing, so reserve conservatively
318            self.data.reserve(lower.saturating_sub(self.data.len()));
319        }
320        for sample in iter {
321            self.add(sample);
322        }
323    }
324}
325
326/// An iterator over unique values in a frequencies count.
327pub struct UniqueValues<'a, K> {
328    data_keys: Keys<'a, K, u64>,
329}
330
331impl<'a, K> Iterator for UniqueValues<'a, K> {
332    type Item = &'a K;
333    fn next(&mut self) -> Option<Self::Item> {
334        self.data_keys.next()
335    }
336}
337
338#[cfg(test)]
339mod test {
340    use super::Frequencies;
341    use std::iter::FromIterator;
342
343    #[test]
344    fn ranked() {
345        let mut counts = Frequencies::new();
346        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
347        let (most_count, most_total) = counts.most_frequent();
348        assert_eq!(most_count[0], (&2, 5));
349        assert_eq!(most_total, 11);
350        let (least_count, least_total) = counts.least_frequent();
351        assert_eq!(least_count[0], (&3, 1));
352        assert_eq!(least_total, 11);
353        assert_eq!(
354            counts.least_frequent(),
355            (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
356        );
357    }
358
359    #[test]
360    fn ranked2() {
361        let mut counts = Frequencies::new();
362        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
363        let (most_count, most_total) = counts.par_frequent(false);
364        assert_eq!(most_count[0], (&2, 5));
365        assert_eq!(most_total, 11);
366        let (least_count, least_total) = counts.par_frequent(true);
367        assert_eq!(least_count[0], (&3, 1));
368        assert_eq!(least_total, 11);
369    }
370
371    #[test]
372    fn unique_values() {
373        let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
374        let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
375        unique.sort_unstable();
376        assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
377    }
378
379    #[test]
380    fn test_top_n() {
381        let mut freq = Frequencies::new();
382        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
383
384        let top_2 = freq.top_n(2);
385        assert_eq!(top_2.len(), 2);
386        assert_eq!(top_2[0], (&4, 4)); // Most frequent
387        assert_eq!(top_2[1], (&1, 3)); // Second most frequent
388
389        let bottom_2 = freq.bottom_n(2);
390        assert_eq!(bottom_2.len(), 2);
391        assert_eq!(bottom_2[0], (&3, 1)); // Least frequent
392        assert_eq!(bottom_2[1], (&2, 2)); // Second least frequent
393    }
394
395    #[test]
396    fn test_count_methods() {
397        let mut freq = Frequencies::new();
398        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
399
400        // Test total_count()
401        assert_eq!(freq.total_count(), 10);
402
403        // Test has_count()
404        assert!(freq.has_count(3)); // 1 appears 3 times
405        assert!(freq.has_count(4)); // 4 appears 4 times
406        assert!(freq.has_count(1)); // 3 appears 1 time
407        assert!(!freq.has_count(5)); // No element appears 5 times
408
409        // Test items_with_count()
410        let items_with_3 = freq.items_with_count(3);
411        assert_eq!(items_with_3, vec![&1]); // Only 1 appears 3 times
412
413        let items_with_2 = freq.items_with_count(2);
414        assert_eq!(items_with_2, vec![&2]); // Only 2 appears 2 times
415
416        let items_with_1 = freq.items_with_count(1);
417        assert_eq!(items_with_1, vec![&3]); // Only 3 appears 1 time
418
419        let items_with_4 = freq.items_with_count(4);
420        assert_eq!(items_with_4, vec![&4]); // Only 4 appears 4 times
421
422        let items_with_5 = freq.items_with_count(5);
423        assert!(items_with_5.is_empty()); // No elements appear 5 times
424    }
425
426    #[test]
427    fn add_borrowed_inserts_new_key() {
428        let mut freq = Frequencies::<Vec<u8>>::new();
429        freq.add_borrowed(b"hello");
430        assert_eq!(freq.count(&b"hello".to_vec()), 1);
431        assert_eq!(freq.cardinality(), 1);
432    }
433
434    #[test]
435    fn add_borrowed_increments_existing_key() {
436        let mut freq = Frequencies::<Vec<u8>>::new();
437        freq.add_borrowed(b"hello");
438        freq.add_borrowed(b"hello");
439        freq.add_borrowed(b"hello");
440        assert_eq!(freq.count(&b"hello".to_vec()), 3);
441        assert_eq!(freq.cardinality(), 1);
442
443        // Also test increment_by_borrowed
444        freq.increment_by_borrowed(b"world", 5);
445        assert_eq!(freq.count(&b"world".to_vec()), 5);
446        freq.increment_by_borrowed(b"world", 3);
447        assert_eq!(freq.count(&b"world".to_vec()), 8);
448    }
449
450    #[test]
451    fn borrowed_owned_interop_for_same_key() {
452        let mut freq = Frequencies::<Vec<u8>>::new();
453        // Insert via owned add
454        freq.add(b"key".to_vec());
455        // Increment via borrowed add
456        freq.add_borrowed(b"key");
457        freq.increment_by_borrowed(b"key", 3);
458        // All methods should see the same accumulated count
459        assert_eq!(freq.count(&b"key".to_vec()), 5);
460        assert_eq!(freq.cardinality(), 1);
461    }
462}