Skip to main content

stats/
frequency.rs

1use std::collections::hash_map::{Entry, Keys};
2use std::hash::Hash;
3
4use foldhash::{HashMap, HashMapExt};
5
6use rayon::prelude::*;
7
8use crate::Commute;
9
10const PARALLEL_THRESHOLD: usize = 10_000;
11/// A commutative data structure for exact frequency counts.
12#[derive(Clone)]
13pub struct Frequencies<T> {
14    data: HashMap<T, u64>,
15}
16
17#[cfg(debug_assertions)]
18impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
19    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
20        write!(f, "{:?}", self.data)
21    }
22}
23
24impl<T: Eq + Hash> Frequencies<T> {
25    /// Create a new frequency table with no samples.
26    #[must_use]
27    pub fn new() -> Frequencies<T> {
28        Default::default()
29    }
30
31    // Add constructor with configurable capacity
32    #[must_use]
33    pub fn with_capacity(capacity: usize) -> Self {
34        Frequencies {
35            data: HashMap::with_capacity(capacity),
36        }
37    }
38
39    /// Add a value to the frequency table.
40    #[inline]
41    pub fn add(&mut self, v: T) {
42        *self.data.entry(v).or_insert(0) += 1;
43    }
44
45    /// Return the number of occurrences of `v` in the data.
46    #[inline]
47    #[must_use]
48    pub fn count(&self, v: &T) -> u64 {
49        self.data.get(v).copied().unwrap_or(0)
50    }
51
52    /// Return the cardinality (number of unique elements) in the data.
53    #[inline]
54    #[must_use]
55    pub fn cardinality(&self) -> u64 {
56        self.len() as u64
57    }
58
59    /// Collect counts and total in a single pass, reused by `most/least_frequent`.
60    fn collect_counts(&self) -> (Vec<(&T, u64)>, u64) {
61        let mut total_count = 0u64;
62        let counts: Vec<(&T, u64)> = self
63            .data
64            .iter()
65            .map(|(k, &v)| {
66                total_count += v;
67                (k, v)
68            })
69            .collect();
70        (counts, total_count)
71    }
72
73    /// Return a `Vec` of elements, their corresponding counts in
74    /// descending order, and the total count.
75    #[inline]
76    #[must_use]
77    pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
78        let (mut counts, total_count) = self.collect_counts();
79        counts.sort_unstable_by_key(|&(_, c)| std::cmp::Reverse(c));
80        (counts, total_count)
81    }
82
83    /// Return a `Vec` of elements, their corresponding counts in
84    /// ascending order, and the total count.
85    #[inline]
86    #[must_use]
87    pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
88        let (mut counts, total_count) = self.collect_counts();
89        counts.sort_unstable_by_key(|&(_, c)| c);
90        (counts, total_count)
91    }
92
93    /// Return a `Vec` of elements, their corresponding counts in order
94    /// based on the `least` parameter, and the total count. Uses parallel sort.
95    #[inline]
96    #[must_use]
97    pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
98    where
99        for<'a> (&'a T, u64): Send,
100        T: Ord,
101    {
102        let (mut counts, total_count) = self.collect_counts();
103        // sort by counts asc/desc
104        // if counts are equal, sort by values lexicographically
105        // We need to do this because otherwise the values are not guaranteed to be in order for
106        // equal counts
107        if least {
108            // return counts in ascending order
109            let sort_fn =
110                |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c1.cmp(&c2).then_with(|| v1.cmp(v2));
111            if counts.len() < PARALLEL_THRESHOLD {
112                counts.sort_unstable_by(sort_fn);
113            } else {
114                counts.par_sort_unstable_by(sort_fn);
115            }
116        } else {
117            // return counts in descending order
118            let sort_fn =
119                |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c2.cmp(&c1).then_with(|| v1.cmp(v2));
120            if counts.len() < PARALLEL_THRESHOLD {
121                counts.sort_unstable_by(sort_fn);
122            } else {
123                counts.par_sort_unstable_by(sort_fn);
124            }
125        }
126        (counts, total_count)
127    }
128
129    /// Returns the cardinality of the data.
130    #[must_use]
131    pub fn len(&self) -> usize {
132        self.data.len()
133    }
134
135    /// Returns true if there is no frequency/cardinality data.
136    #[must_use]
137    pub fn is_empty(&self) -> bool {
138        self.data.is_empty()
139    }
140
141    /// Return an iterator over the unique values of the data.
142    #[must_use]
143    pub fn unique_values(&self) -> UniqueValues<'_, T> {
144        UniqueValues {
145            data_keys: self.data.keys(),
146        }
147    }
148
149    /// Get the top N most frequent items without sorting the entire vector
150    /// This is much faster than `most_frequent()` when you only need a few items
151    #[must_use]
152    pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
153    where
154        T: Ord,
155    {
156        use std::collections::BinaryHeap;
157
158        // Min-heap (via Reverse) of the n largest elements seen so far.
159        // peek() returns the smallest entry currently in the top-N — replace it
160        // only when a strictly larger count comes in. Avoids a push+pop per
161        // rejected element on high-cardinality inputs with small n.
162        let mut heap = BinaryHeap::with_capacity(n);
163
164        for (item, count) in &self.data {
165            let candidate = (*count, item);
166            if heap.len() < n {
167                heap.push(std::cmp::Reverse(candidate));
168            } else if let Some(top) = heap.peek()
169                && candidate > top.0
170            {
171                heap.pop();
172                heap.push(std::cmp::Reverse(candidate));
173            }
174        }
175
176        // Convert to sorted vector
177        heap.into_sorted_vec()
178            .into_iter()
179            .map(|std::cmp::Reverse((count, item))| (item, count))
180            .collect()
181    }
182
183    /// Similar to `top_n` but for least frequent items
184    #[must_use]
185    pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
186    where
187        T: Ord,
188    {
189        use std::collections::BinaryHeap;
190
191        // Max-heap of the n smallest elements seen so far. Mirror of top_n.
192        let mut heap = BinaryHeap::with_capacity(n);
193
194        for (item, count) in &self.data {
195            let candidate = (*count, item);
196            if heap.len() < n {
197                heap.push(candidate);
198            } else if let Some(top) = heap.peek()
199                && candidate < *top
200            {
201                heap.pop();
202                heap.push(candidate);
203            }
204        }
205
206        heap.into_sorted_vec()
207            .into_iter()
208            .map(|(count, item)| (item, count))
209            .collect()
210    }
211
212    /// Get items with exactly n occurrences
213    #[must_use]
214    pub fn items_with_count(&self, n: u64) -> Vec<&T> {
215        self.data
216            .iter()
217            .filter(|&(_, &count)| count == n)
218            .map(|(item, _)| item)
219            .collect()
220    }
221
222    /// Get the sum of all counts
223    #[must_use]
224    pub fn total_count(&self) -> u64 {
225        self.data.values().sum()
226    }
227
228    /// Check if any item occurs exactly n times
229    #[must_use]
230    pub fn has_count(&self, n: u64) -> bool {
231        self.data.values().any(|&count| count == n)
232    }
233
234    /// Add specialized method for single increment
235    #[inline]
236    pub fn increment_by(&mut self, v: T, count: u64) {
237        match self.data.entry(v) {
238            Entry::Vacant(entry) => {
239                entry.insert(count);
240            }
241            Entry::Occupied(mut entry) => {
242                *entry.get_mut() += count;
243            }
244        }
245    }
246}
247
248impl Frequencies<Vec<u8>> {
249    /// Increment count for a byte slice key, avoiding allocation when key exists.
250    /// Uses borrowed lookup via `get_mut(&[u8])` before falling back to owned insert.
251    /// This works because `Vec<u8>: Borrow<[u8]>`, so `HashMap` accepts `&[u8]` for lookup.
252    /// For low-cardinality columns (the common case), this eliminates ~99% of allocations.
253    #[inline]
254    pub fn add_borrowed(&mut self, v: &[u8]) {
255        if let Some(count) = self.data.get_mut(v) {
256            *count += 1;
257        } else {
258            self.data.insert(v.to_vec(), 1);
259        }
260    }
261
262    /// Increment by a count for a byte slice key, avoiding allocation when key exists.
263    #[inline]
264    pub fn increment_by_borrowed(&mut self, v: &[u8], count: u64) {
265        if let Some(existing) = self.data.get_mut(v) {
266            *existing += count;
267        } else {
268            self.data.insert(v.to_vec(), count);
269        }
270    }
271}
272
273impl<T: Eq + Hash> Commute for Frequencies<T> {
274    #[inline]
275    fn merge(&mut self, v: Frequencies<T>) {
276        // Reserve additional capacity to avoid reallocations
277        self.data.reserve(v.data.len());
278
279        for (k, v2) in v.data {
280            match self.data.entry(k) {
281                Entry::Vacant(v1) => {
282                    v1.insert(v2);
283                }
284                Entry::Occupied(mut v1) => {
285                    *v1.get_mut() += v2;
286                }
287            }
288        }
289    }
290}
291
292impl<T: Eq + Hash> Default for Frequencies<T> {
293    #[inline]
294    fn default() -> Frequencies<T> {
295        Frequencies {
296            data: HashMap::with_capacity(64),
297        }
298    }
299}
300
301impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
302    #[inline]
303    fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
304        let mut v = Frequencies::new();
305        v.extend(it);
306        v
307    }
308}
309
310impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
311    #[inline]
312    fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
313        let iter = it.into_iter();
314        // Reserve capacity if size hint is available and reliable
315        if let (lower, Some(upper)) = iter.size_hint()
316            && lower == upper
317        {
318            // Exact size known - reserve capacity for new entries
319            // We don't know how many will be new vs existing, so reserve conservatively
320            self.data.reserve(lower.saturating_sub(self.data.len()));
321        }
322        for sample in iter {
323            self.add(sample);
324        }
325    }
326}
327
328/// An iterator over unique values in a frequencies count.
329pub struct UniqueValues<'a, K> {
330    data_keys: Keys<'a, K, u64>,
331}
332
333impl<'a, K> Iterator for UniqueValues<'a, K> {
334    type Item = &'a K;
335    fn next(&mut self) -> Option<Self::Item> {
336        self.data_keys.next()
337    }
338}
339
340#[cfg(test)]
341mod test {
342    use super::Frequencies;
343    use std::iter::FromIterator;
344
345    #[test]
346    fn ranked() {
347        let mut counts = Frequencies::new();
348        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
349        let (most_count, most_total) = counts.most_frequent();
350        assert_eq!(most_count[0], (&2, 5));
351        assert_eq!(most_total, 11);
352        let (least_count, least_total) = counts.least_frequent();
353        assert_eq!(least_count[0], (&3, 1));
354        assert_eq!(least_total, 11);
355        assert_eq!(
356            counts.least_frequent(),
357            (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
358        );
359    }
360
361    #[test]
362    fn ranked2() {
363        let mut counts = Frequencies::new();
364        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
365        let (most_count, most_total) = counts.par_frequent(false);
366        assert_eq!(most_count[0], (&2, 5));
367        assert_eq!(most_total, 11);
368        let (least_count, least_total) = counts.par_frequent(true);
369        assert_eq!(least_count[0], (&3, 1));
370        assert_eq!(least_total, 11);
371    }
372
373    #[test]
374    fn unique_values() {
375        let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
376        let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
377        unique.sort_unstable();
378        assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
379    }
380
381    #[test]
382    fn test_top_n() {
383        let mut freq = Frequencies::new();
384        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
385
386        let top_2 = freq.top_n(2);
387        assert_eq!(top_2.len(), 2);
388        assert_eq!(top_2[0], (&4, 4)); // Most frequent
389        assert_eq!(top_2[1], (&1, 3)); // Second most frequent
390
391        let bottom_2 = freq.bottom_n(2);
392        assert_eq!(bottom_2.len(), 2);
393        assert_eq!(bottom_2[0], (&3, 1)); // Least frequent
394        assert_eq!(bottom_2[1], (&2, 2)); // Second least frequent
395    }
396
397    #[test]
398    fn test_count_methods() {
399        let mut freq = Frequencies::new();
400        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
401
402        // Test total_count()
403        assert_eq!(freq.total_count(), 10);
404
405        // Test has_count()
406        assert!(freq.has_count(3)); // 1 appears 3 times
407        assert!(freq.has_count(4)); // 4 appears 4 times
408        assert!(freq.has_count(1)); // 3 appears 1 time
409        assert!(!freq.has_count(5)); // No element appears 5 times
410
411        // Test items_with_count()
412        let items_with_3 = freq.items_with_count(3);
413        assert_eq!(items_with_3, vec![&1]); // Only 1 appears 3 times
414
415        let items_with_2 = freq.items_with_count(2);
416        assert_eq!(items_with_2, vec![&2]); // Only 2 appears 2 times
417
418        let items_with_1 = freq.items_with_count(1);
419        assert_eq!(items_with_1, vec![&3]); // Only 3 appears 1 time
420
421        let items_with_4 = freq.items_with_count(4);
422        assert_eq!(items_with_4, vec![&4]); // Only 4 appears 4 times
423
424        let items_with_5 = freq.items_with_count(5);
425        assert!(items_with_5.is_empty()); // No elements appear 5 times
426    }
427
428    #[test]
429    fn add_borrowed_inserts_new_key() {
430        let mut freq = Frequencies::<Vec<u8>>::new();
431        freq.add_borrowed(b"hello");
432        assert_eq!(freq.count(&b"hello".to_vec()), 1);
433        assert_eq!(freq.cardinality(), 1);
434    }
435
436    #[test]
437    fn add_borrowed_increments_existing_key() {
438        let mut freq = Frequencies::<Vec<u8>>::new();
439        freq.add_borrowed(b"hello");
440        freq.add_borrowed(b"hello");
441        freq.add_borrowed(b"hello");
442        assert_eq!(freq.count(&b"hello".to_vec()), 3);
443        assert_eq!(freq.cardinality(), 1);
444
445        // Also test increment_by_borrowed
446        freq.increment_by_borrowed(b"world", 5);
447        assert_eq!(freq.count(&b"world".to_vec()), 5);
448        freq.increment_by_borrowed(b"world", 3);
449        assert_eq!(freq.count(&b"world".to_vec()), 8);
450    }
451
452    #[test]
453    fn borrowed_owned_interop_for_same_key() {
454        let mut freq = Frequencies::<Vec<u8>>::new();
455        // Insert via owned add
456        freq.add(b"key".to_vec());
457        // Increment via borrowed add
458        freq.add_borrowed(b"key");
459        freq.increment_by_borrowed(b"key", 3);
460        // All methods should see the same accumulated count
461        assert_eq!(freq.count(&b"key".to_vec()), 5);
462        assert_eq!(freq.cardinality(), 1);
463    }
464}