Skip to main content

stats/
frequency.rs

1use std::collections::hash_map::{Entry, Keys};
2use std::hash::Hash;
3
4use foldhash::{HashMap, HashMapExt};
5
6use rayon::prelude::*;
7
8use crate::Commute;
9
10const PARALLEL_THRESHOLD: usize = 10_000;
11/// A commutative data structure for exact frequency counts.
12#[derive(Clone)]
13pub struct Frequencies<T> {
14    data: HashMap<T, u64>,
15}
16
17#[cfg(debug_assertions)]
18impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
19    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
20        write!(f, "{:?}", self.data)
21    }
22}
23
24impl<T: Eq + Hash> Frequencies<T> {
25    /// Create a new frequency table with no samples.
26    #[must_use]
27    pub fn new() -> Frequencies<T> {
28        Default::default()
29    }
30
31    // Add constructor with configurable capacity
32    #[must_use]
33    pub fn with_capacity(capacity: usize) -> Self {
34        Frequencies {
35            data: HashMap::with_capacity(capacity),
36        }
37    }
38
39    /// Add a value to the frequency table.
40    #[inline]
41    pub fn add(&mut self, v: T) {
42        *self.data.entry(v).or_insert(0) += 1;
43    }
44
45    /// Return the number of occurrences of `v` in the data.
46    #[inline]
47    #[must_use]
48    pub fn count(&self, v: &T) -> u64 {
49        self.data.get(v).copied().unwrap_or(0)
50    }
51
52    /// Return the cardinality (number of unique elements) in the data.
53    #[inline]
54    #[must_use]
55    pub fn cardinality(&self) -> u64 {
56        self.len() as u64
57    }
58
59    /// Collect counts and total in a single pass, reused by most/least_frequent.
60    fn collect_counts(&self) -> (Vec<(&T, u64)>, u64) {
61        let mut total_count = 0u64;
62        let counts: Vec<(&T, u64)> = self
63            .data
64            .iter()
65            .map(|(k, &v)| {
66                total_count += v;
67                (k, v)
68            })
69            .collect();
70        (counts, total_count)
71    }
72
73    /// Return a `Vec` of elements, their corresponding counts in
74    /// descending order, and the total count.
75    #[inline]
76    #[must_use]
77    pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
78        let (mut counts, total_count) = self.collect_counts();
79        counts.sort_unstable_by(|&(_, c1), &(_, c2)| c2.cmp(&c1));
80        (counts, total_count)
81    }
82
83    /// Return a `Vec` of elements, their corresponding counts in
84    /// ascending order, and the total count.
85    #[inline]
86    #[must_use]
87    pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
88        let (mut counts, total_count) = self.collect_counts();
89        counts.sort_unstable_by(|&(_, c1), &(_, c2)| c1.cmp(&c2));
90        (counts, total_count)
91    }
92
93    /// Return a `Vec` of elements, their corresponding counts in order
94    /// based on the `least` parameter, and the total count. Uses parallel sort.
95    #[inline]
96    #[must_use]
97    pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
98    where
99        for<'a> (&'a T, u64): Send,
100        T: Ord,
101    {
102        let (mut counts, total_count) = self.collect_counts();
103        // sort by counts asc/desc
104        // if counts are equal, sort by values lexicographically
105        // We need to do this because otherwise the values are not guaranteed to be in order for
106        // equal counts
107        if least {
108            // return counts in ascending order
109            let sort_fn = |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| {
110                c1.cmp(&c2).then_with(|| v1.cmp(v2))
111            };
112            if counts.len() < PARALLEL_THRESHOLD {
113                counts.sort_unstable_by(sort_fn);
114            } else {
115                counts.par_sort_unstable_by(sort_fn);
116            }
117        } else {
118            // return counts in descending order
119            let sort_fn = |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| {
120                c2.cmp(&c1).then_with(|| v1.cmp(v2))
121            };
122            if counts.len() < PARALLEL_THRESHOLD {
123                counts.sort_unstable_by(sort_fn);
124            } else {
125                counts.par_sort_unstable_by(sort_fn);
126            }
127        }
128        (counts, total_count)
129    }
130
131    /// Returns the cardinality of the data.
132    #[must_use]
133    pub fn len(&self) -> usize {
134        self.data.len()
135    }
136
137    /// Returns true if there is no frequency/cardinality data.
138    #[must_use]
139    pub fn is_empty(&self) -> bool {
140        self.data.is_empty()
141    }
142
143    /// Return an iterator over the unique values of the data.
144    #[must_use]
145    pub fn unique_values(&self) -> UniqueValues<'_, T> {
146        UniqueValues {
147            data_keys: self.data.keys(),
148        }
149    }
150
151    /// Get the top N most frequent items without sorting the entire vector
152    /// This is much faster than `most_frequent()` when you only need a few items
153    #[must_use]
154    pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
155    where
156        T: Ord,
157    {
158        use std::collections::BinaryHeap;
159
160        // We use a min-heap of size n to keep track of the largest elements
161        let mut heap = BinaryHeap::with_capacity(n + 1);
162
163        for (item, count) in &self.data {
164            // Negate count because BinaryHeap is a max-heap
165            // and we want to remove smallest elements
166            heap.push(std::cmp::Reverse((*count, item)));
167
168            // Keep heap size at n
169            if heap.len() > n {
170                heap.pop();
171            }
172        }
173
174        // Convert to sorted vector
175        heap.into_sorted_vec()
176            .into_iter()
177            .map(|std::cmp::Reverse((count, item))| (item, count))
178            .collect()
179    }
180
181    /// Similar to `top_n` but for least frequent items
182    #[must_use]
183    pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
184    where
185        T: Ord,
186    {
187        use std::collections::BinaryHeap;
188
189        let mut heap = BinaryHeap::with_capacity(n + 1);
190
191        for (item, count) in &self.data {
192            heap.push((*count, item));
193            if heap.len() > n {
194                heap.pop();
195            }
196        }
197
198        heap.into_sorted_vec()
199            .into_iter()
200            .map(|(count, item)| (item, count))
201            .collect()
202    }
203
204    /// Get items with exactly n occurrences
205    #[must_use]
206    pub fn items_with_count(&self, n: u64) -> Vec<&T> {
207        self.data
208            .iter()
209            .filter(|&(_, &count)| count == n)
210            .map(|(item, _)| item)
211            .collect()
212    }
213
214    /// Get the sum of all counts
215    #[must_use]
216    pub fn total_count(&self) -> u64 {
217        self.data.values().sum()
218    }
219
220    /// Check if any item occurs exactly n times
221    #[must_use]
222    pub fn has_count(&self, n: u64) -> bool {
223        self.data.values().any(|&count| count == n)
224    }
225
226    /// Add specialized method for single increment
227    #[inline]
228    pub fn increment_by(&mut self, v: T, count: u64) {
229        match self.data.entry(v) {
230            Entry::Vacant(entry) => {
231                entry.insert(count);
232            }
233            Entry::Occupied(mut entry) => {
234                *entry.get_mut() += count;
235            }
236        }
237    }
238}
239
240impl Frequencies<Vec<u8>> {
241    /// Increment count for a byte slice key, avoiding allocation when key exists.
242    /// Uses borrowed lookup via `get_mut(&[u8])` before falling back to owned insert.
243    /// This works because `Vec<u8>: Borrow<[u8]>`, so HashMap accepts `&[u8]` for lookup.
244    /// For low-cardinality columns (the common case), this eliminates ~99% of allocations.
245    #[inline]
246    pub fn add_borrowed(&mut self, v: &[u8]) {
247        if let Some(count) = self.data.get_mut(v) {
248            *count += 1;
249        } else {
250            self.data.insert(v.to_vec(), 1);
251        }
252    }
253
254    /// Increment by a count for a byte slice key, avoiding allocation when key exists.
255    #[inline]
256    pub fn increment_by_borrowed(&mut self, v: &[u8], count: u64) {
257        if let Some(existing) = self.data.get_mut(v) {
258            *existing += count;
259        } else {
260            self.data.insert(v.to_vec(), count);
261        }
262    }
263}
264
265impl<T: Eq + Hash> Commute for Frequencies<T> {
266    #[inline]
267    fn merge(&mut self, v: Frequencies<T>) {
268        // Reserve additional capacity to avoid reallocations
269        self.data.reserve(v.data.len());
270
271        for (k, v2) in v.data {
272            match self.data.entry(k) {
273                Entry::Vacant(v1) => {
274                    v1.insert(v2);
275                }
276                Entry::Occupied(mut v1) => {
277                    *v1.get_mut() += v2;
278                }
279            }
280        }
281    }
282}
283
284impl<T: Eq + Hash> Default for Frequencies<T> {
285    #[inline]
286    fn default() -> Frequencies<T> {
287        Frequencies {
288            data: HashMap::with_capacity(64),
289        }
290    }
291}
292
293impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
294    #[inline]
295    fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
296        let mut v = Frequencies::new();
297        v.extend(it);
298        v
299    }
300}
301
302impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
303    #[inline]
304    fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
305        let iter = it.into_iter();
306        // Reserve capacity if size hint is available and reliable
307        if let (lower, Some(upper)) = iter.size_hint()
308            && lower == upper
309        {
310            // Exact size known - reserve capacity for new entries
311            // We don't know how many will be new vs existing, so reserve conservatively
312            self.data.reserve(lower.saturating_sub(self.data.len()));
313        }
314        for sample in iter {
315            self.add(sample);
316        }
317    }
318}
319
320/// An iterator over unique values in a frequencies count.
321pub struct UniqueValues<'a, K> {
322    data_keys: Keys<'a, K, u64>,
323}
324
325impl<'a, K> Iterator for UniqueValues<'a, K> {
326    type Item = &'a K;
327    fn next(&mut self) -> Option<Self::Item> {
328        self.data_keys.next()
329    }
330}
331
332#[cfg(test)]
333mod test {
334    use super::Frequencies;
335    use std::iter::FromIterator;
336
337    #[test]
338    fn ranked() {
339        let mut counts = Frequencies::new();
340        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
341        let (most_count, most_total) = counts.most_frequent();
342        assert_eq!(most_count[0], (&2, 5));
343        assert_eq!(most_total, 11);
344        let (least_count, least_total) = counts.least_frequent();
345        assert_eq!(least_count[0], (&3, 1));
346        assert_eq!(least_total, 11);
347        assert_eq!(
348            counts.least_frequent(),
349            (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
350        );
351    }
352
353    #[test]
354    fn ranked2() {
355        let mut counts = Frequencies::new();
356        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
357        let (most_count, most_total) = counts.par_frequent(false);
358        assert_eq!(most_count[0], (&2, 5));
359        assert_eq!(most_total, 11);
360        let (least_count, least_total) = counts.par_frequent(true);
361        assert_eq!(least_count[0], (&3, 1));
362        assert_eq!(least_total, 11);
363    }
364
365    #[test]
366    fn unique_values() {
367        let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
368        let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
369        unique.sort_unstable();
370        assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
371    }
372
373    #[test]
374    fn test_top_n() {
375        let mut freq = Frequencies::new();
376        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
377
378        let top_2 = freq.top_n(2);
379        assert_eq!(top_2.len(), 2);
380        assert_eq!(top_2[0], (&4, 4)); // Most frequent
381        assert_eq!(top_2[1], (&1, 3)); // Second most frequent
382
383        let bottom_2 = freq.bottom_n(2);
384        assert_eq!(bottom_2.len(), 2);
385        assert_eq!(bottom_2[0], (&3, 1)); // Least frequent
386        assert_eq!(bottom_2[1], (&2, 2)); // Second least frequent
387    }
388
389    #[test]
390    fn test_count_methods() {
391        let mut freq = Frequencies::new();
392        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
393
394        // Test total_count()
395        assert_eq!(freq.total_count(), 10);
396
397        // Test has_count()
398        assert!(freq.has_count(3)); // 1 appears 3 times
399        assert!(freq.has_count(4)); // 4 appears 4 times
400        assert!(freq.has_count(1)); // 3 appears 1 time
401        assert!(!freq.has_count(5)); // No element appears 5 times
402
403        // Test items_with_count()
404        let items_with_3 = freq.items_with_count(3);
405        assert_eq!(items_with_3, vec![&1]); // Only 1 appears 3 times
406
407        let items_with_2 = freq.items_with_count(2);
408        assert_eq!(items_with_2, vec![&2]); // Only 2 appears 2 times
409
410        let items_with_1 = freq.items_with_count(1);
411        assert_eq!(items_with_1, vec![&3]); // Only 3 appears 1 time
412
413        let items_with_4 = freq.items_with_count(4);
414        assert_eq!(items_with_4, vec![&4]); // Only 4 appears 4 times
415
416        let items_with_5 = freq.items_with_count(5);
417        assert!(items_with_5.is_empty()); // No elements appear 5 times
418    }
419
420    #[test]
421    fn add_borrowed_inserts_new_key() {
422        let mut freq = Frequencies::<Vec<u8>>::new();
423        freq.add_borrowed(b"hello");
424        assert_eq!(freq.count(&b"hello".to_vec()), 1);
425        assert_eq!(freq.cardinality(), 1);
426    }
427
428    #[test]
429    fn add_borrowed_increments_existing_key() {
430        let mut freq = Frequencies::<Vec<u8>>::new();
431        freq.add_borrowed(b"hello");
432        freq.add_borrowed(b"hello");
433        freq.add_borrowed(b"hello");
434        assert_eq!(freq.count(&b"hello".to_vec()), 3);
435        assert_eq!(freq.cardinality(), 1);
436
437        // Also test increment_by_borrowed
438        freq.increment_by_borrowed(b"world", 5);
439        assert_eq!(freq.count(&b"world".to_vec()), 5);
440        freq.increment_by_borrowed(b"world", 3);
441        assert_eq!(freq.count(&b"world".to_vec()), 8);
442    }
443
444    #[test]
445    fn borrowed_owned_interop_for_same_key() {
446        let mut freq = Frequencies::<Vec<u8>>::new();
447        // Insert via owned add
448        freq.add(b"key".to_vec());
449        // Increment via borrowed add
450        freq.add_borrowed(b"key");
451        freq.increment_by_borrowed(b"key", 3);
452        // All methods should see the same accumulated count
453        assert_eq!(freq.count(&b"key".to_vec()), 5);
454        assert_eq!(freq.cardinality(), 1);
455    }
456}