Skip to main content

stats/
frequency.rs

1use std::collections::hash_map::{Entry, Keys};
2use std::hash::Hash;
3
4use foldhash::{HashMap, HashMapExt};
5
6use rayon::prelude::*;
7
8use crate::Commute;
9/// A commutative data structure for exact frequency counts.
10#[derive(Clone)]
11pub struct Frequencies<T> {
12    data: HashMap<T, u64>,
13}
14
15#[cfg(debug_assertions)]
16impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
17    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
18        write!(f, "{:?}", self.data)
19    }
20}
21
22impl<T: Eq + Hash> Frequencies<T> {
23    /// Create a new frequency table with no samples.
24    #[must_use]
25    pub fn new() -> Frequencies<T> {
26        Default::default()
27    }
28
29    // Add constructor with configurable capacity
30    #[must_use]
31    pub fn with_capacity(capacity: usize) -> Self {
32        Frequencies {
33            data: HashMap::with_capacity(capacity),
34        }
35    }
36
37    /// Add a value to the frequency table.
38    #[inline]
39    pub fn add(&mut self, v: T) {
40        *self.data.entry(v).or_insert(0) += 1;
41    }
42
43    /// Return the number of occurrences of `v` in the data.
44    #[inline]
45    #[must_use]
46    pub fn count(&self, v: &T) -> u64 {
47        self.data.get(v).copied().unwrap_or(0)
48    }
49
50    /// Return the cardinality (number of unique elements) in the data.
51    #[inline]
52    #[must_use]
53    pub fn cardinality(&self) -> u64 {
54        self.len() as u64
55    }
56
57    /// Return a `Vec` of elements, their corresponding counts in
58    /// descending order, and the total count.
59    #[inline]
60    #[must_use]
61    pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
62        let len = self.data.len();
63        let total_count: u64 = self.data.values().sum();
64        let mut counts = Vec::with_capacity(len);
65
66        for (k, &v) in &self.data {
67            counts.push((k, v));
68        }
69        counts.sort_unstable_by(|&(_, c1), &(_, c2)| c2.cmp(&c1));
70        (counts, total_count)
71    }
72
73    /// Return a `Vec` of elements, their corresponding counts in
74    /// ascending order, and the total count.
75    #[inline]
76    #[must_use]
77    pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
78        let len = self.data.len();
79        let total_count: u64 = self.data.values().sum();
80        let mut counts = Vec::with_capacity(len);
81
82        for (k, &v) in &self.data {
83            counts.push((k, v));
84        }
85        counts.sort_unstable_by(|&(_, c1), &(_, c2)| c1.cmp(&c2));
86        (counts, total_count)
87    }
88
89    /// Return a `Vec` of elements, their corresponding counts in order
90    /// based on the `least` parameter, and the total count. Uses parallel sort.
91    #[inline]
92    #[must_use]
93    pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
94    where
95        for<'a> (&'a T, u64): Send,
96        T: Ord,
97    {
98        let len = self.data.len();
99        let total_count: u64 = self.data.values().sum();
100        let mut counts = Vec::with_capacity(len);
101
102        for (k, &v) in &self.data {
103            counts.push((k, v));
104        }
105        // sort by counts asc/desc
106        // if counts are equal, sort by values lexicographically
107        // We need to do this because otherwise the values are not guaranteed to be in order for equal counts
108        if least {
109            // return counts in ascending order
110            counts.par_sort_unstable_by(|&(v1, c1), &(v2, c2)| {
111                let cmp = c1.cmp(&c2);
112                if cmp == std::cmp::Ordering::Equal {
113                    v1.cmp(v2)
114                } else {
115                    cmp
116                }
117            });
118        } else {
119            // return counts in descending order
120            counts
121                .par_sort_unstable_by(|&(v1, c1), &(v2, c2)| c2.cmp(&c1).then_with(|| v1.cmp(v2)));
122        }
123        (counts, total_count)
124    }
125
126    /// Returns the cardinality of the data.
127    #[must_use]
128    pub fn len(&self) -> usize {
129        self.data.len()
130    }
131
132    /// Returns true if there is no frequency/cardinality data.
133    #[must_use]
134    pub fn is_empty(&self) -> bool {
135        self.data.is_empty()
136    }
137
138    /// Return an iterator over the unique values of the data.
139    #[must_use]
140    pub fn unique_values(&self) -> UniqueValues<'_, T> {
141        UniqueValues {
142            data_keys: self.data.keys(),
143        }
144    }
145
146    /// Get the top N most frequent items without sorting the entire vector
147    /// This is much faster than `most_frequent()` when you only need a few items
148    #[must_use]
149    pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
150    where
151        T: Ord,
152    {
153        use std::collections::BinaryHeap;
154
155        // We use a min-heap of size n to keep track of the largest elements
156        let mut heap = BinaryHeap::with_capacity(n + 1);
157
158        for (item, count) in &self.data {
159            // Negate count because BinaryHeap is a max-heap
160            // and we want to remove smallest elements
161            heap.push(std::cmp::Reverse((*count, item)));
162
163            // Keep heap size at n
164            if heap.len() > n {
165                heap.pop();
166            }
167        }
168
169        // Convert to sorted vector
170        heap.into_sorted_vec()
171            .into_iter()
172            .map(|std::cmp::Reverse((count, item))| (item, count))
173            .collect()
174    }
175
176    /// Similar to `top_n` but for least frequent items
177    #[must_use]
178    pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
179    where
180        T: Ord,
181    {
182        use std::collections::BinaryHeap;
183
184        let mut heap = BinaryHeap::with_capacity(n + 1);
185
186        for (item, count) in &self.data {
187            heap.push((*count, item));
188            if heap.len() > n {
189                heap.pop();
190            }
191        }
192
193        heap.into_sorted_vec()
194            .into_iter()
195            .map(|(count, item)| (item, count))
196            .collect()
197    }
198
199    /// Get items with exactly n occurrences
200    #[must_use]
201    pub fn items_with_count(&self, n: u64) -> Vec<&T> {
202        self.data
203            .iter()
204            .filter(|&(_, &count)| count == n)
205            .map(|(item, _)| item)
206            .collect()
207    }
208
209    /// Get the sum of all counts
210    #[must_use]
211    pub fn total_count(&self) -> u64 {
212        self.data.values().sum()
213    }
214
215    /// Check if any item occurs exactly n times
216    #[must_use]
217    pub fn has_count(&self, n: u64) -> bool {
218        self.data.values().any(|&count| count == n)
219    }
220
221    /// Add specialized method for single increment
222    #[inline]
223    pub fn increment_by(&mut self, v: T, count: u64) {
224        match self.data.entry(v) {
225            Entry::Vacant(entry) => {
226                entry.insert(count);
227            }
228            Entry::Occupied(mut entry) => {
229                *entry.get_mut() += count;
230            }
231        }
232    }
233}
234
235impl Frequencies<Vec<u8>> {
236    /// Increment count for a byte slice key, avoiding allocation when key exists.
237    /// Uses borrowed lookup via `get_mut(&[u8])` before falling back to owned insert.
238    /// This works because `Vec<u8>: Borrow<[u8]>`, so HashMap accepts `&[u8]` for lookup.
239    /// For low-cardinality columns (the common case), this eliminates ~99% of allocations.
240    #[inline]
241    pub fn add_borrowed(&mut self, v: &[u8]) {
242        if let Some(count) = self.data.get_mut(v) {
243            *count += 1;
244        } else {
245            self.data.insert(v.to_vec(), 1);
246        }
247    }
248
249    /// Increment by a count for a byte slice key, avoiding allocation when key exists.
250    #[inline]
251    pub fn increment_by_borrowed(&mut self, v: &[u8], count: u64) {
252        if let Some(existing) = self.data.get_mut(v) {
253            *existing += count;
254        } else {
255            self.data.insert(v.to_vec(), count);
256        }
257    }
258}
259
260impl<T: Eq + Hash> Commute for Frequencies<T> {
261    #[inline]
262    fn merge(&mut self, v: Frequencies<T>) {
263        // Reserve additional capacity to avoid reallocations
264        self.data.reserve(v.data.len());
265
266        for (k, v2) in v.data {
267            match self.data.entry(k) {
268                Entry::Vacant(v1) => {
269                    v1.insert(v2);
270                }
271                Entry::Occupied(mut v1) => {
272                    *v1.get_mut() += v2;
273                }
274            }
275        }
276    }
277}
278
279impl<T: Eq + Hash> Default for Frequencies<T> {
280    #[inline]
281    fn default() -> Frequencies<T> {
282        Frequencies {
283            data: HashMap::with_capacity(1_000),
284        }
285    }
286}
287
288impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
289    #[inline]
290    fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
291        let mut v = Frequencies::new();
292        v.extend(it);
293        v
294    }
295}
296
297impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
298    #[inline]
299    fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
300        let iter = it.into_iter();
301        // Reserve capacity if size hint is available and reliable
302        if let (lower, Some(upper)) = iter.size_hint()
303            && lower == upper
304        {
305            // Exact size known - reserve capacity for new entries
306            // We don't know how many will be new vs existing, so reserve conservatively
307            self.data.reserve(lower.saturating_sub(self.data.len()));
308        }
309        for sample in iter {
310            self.add(sample);
311        }
312    }
313}
314
315/// An iterator over unique values in a frequencies count.
316pub struct UniqueValues<'a, K> {
317    data_keys: Keys<'a, K, u64>,
318}
319
320impl<'a, K> Iterator for UniqueValues<'a, K> {
321    type Item = &'a K;
322    fn next(&mut self) -> Option<Self::Item> {
323        self.data_keys.next()
324    }
325}
326
327#[cfg(test)]
328mod test {
329    use super::Frequencies;
330    use std::iter::FromIterator;
331
332    #[test]
333    fn ranked() {
334        let mut counts = Frequencies::new();
335        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
336        let (most_count, most_total) = counts.most_frequent();
337        assert_eq!(most_count[0], (&2, 5));
338        assert_eq!(most_total, 11);
339        let (least_count, least_total) = counts.least_frequent();
340        assert_eq!(least_count[0], (&3, 1));
341        assert_eq!(least_total, 11);
342        assert_eq!(
343            counts.least_frequent(),
344            (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
345        );
346    }
347
348    #[test]
349    fn ranked2() {
350        let mut counts = Frequencies::new();
351        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
352        let (most_count, most_total) = counts.par_frequent(false);
353        assert_eq!(most_count[0], (&2, 5));
354        assert_eq!(most_total, 11);
355        let (least_count, least_total) = counts.par_frequent(true);
356        assert_eq!(least_count[0], (&3, 1));
357        assert_eq!(least_total, 11);
358    }
359
360    #[test]
361    fn unique_values() {
362        let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
363        let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
364        unique.sort_unstable();
365        assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
366    }
367
368    #[test]
369    fn test_top_n() {
370        let mut freq = Frequencies::new();
371        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
372
373        let top_2 = freq.top_n(2);
374        assert_eq!(top_2.len(), 2);
375        assert_eq!(top_2[0], (&4, 4)); // Most frequent
376        assert_eq!(top_2[1], (&1, 3)); // Second most frequent
377
378        let bottom_2 = freq.bottom_n(2);
379        assert_eq!(bottom_2.len(), 2);
380        assert_eq!(bottom_2[0], (&3, 1)); // Least frequent
381        assert_eq!(bottom_2[1], (&2, 2)); // Second least frequent
382    }
383
384    #[test]
385    fn test_count_methods() {
386        let mut freq = Frequencies::new();
387        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
388
389        // Test total_count()
390        assert_eq!(freq.total_count(), 10);
391
392        // Test has_count()
393        assert!(freq.has_count(3)); // 1 appears 3 times
394        assert!(freq.has_count(4)); // 4 appears 4 times
395        assert!(freq.has_count(1)); // 3 appears 1 time
396        assert!(!freq.has_count(5)); // No element appears 5 times
397
398        // Test items_with_count()
399        let items_with_3 = freq.items_with_count(3);
400        assert_eq!(items_with_3, vec![&1]); // Only 1 appears 3 times
401
402        let items_with_2 = freq.items_with_count(2);
403        assert_eq!(items_with_2, vec![&2]); // Only 2 appears 2 times
404
405        let items_with_1 = freq.items_with_count(1);
406        assert_eq!(items_with_1, vec![&3]); // Only 3 appears 1 time
407
408        let items_with_4 = freq.items_with_count(4);
409        assert_eq!(items_with_4, vec![&4]); // Only 4 appears 4 times
410
411        let items_with_5 = freq.items_with_count(5);
412        assert!(items_with_5.is_empty()); // No elements appear 5 times
413    }
414
415    #[test]
416    fn add_borrowed_inserts_new_key() {
417        let mut freq = Frequencies::<Vec<u8>>::new();
418        freq.add_borrowed(b"hello");
419        assert_eq!(freq.count(&b"hello".to_vec()), 1);
420        assert_eq!(freq.cardinality(), 1);
421    }
422
423    #[test]
424    fn add_borrowed_increments_existing_key() {
425        let mut freq = Frequencies::<Vec<u8>>::new();
426        freq.add_borrowed(b"hello");
427        freq.add_borrowed(b"hello");
428        freq.add_borrowed(b"hello");
429        assert_eq!(freq.count(&b"hello".to_vec()), 3);
430        assert_eq!(freq.cardinality(), 1);
431
432        // Also test increment_by_borrowed
433        freq.increment_by_borrowed(b"world", 5);
434        assert_eq!(freq.count(&b"world".to_vec()), 5);
435        freq.increment_by_borrowed(b"world", 3);
436        assert_eq!(freq.count(&b"world".to_vec()), 8);
437    }
438
439    #[test]
440    fn borrowed_owned_interop_for_same_key() {
441        let mut freq = Frequencies::<Vec<u8>>::new();
442        // Insert via owned add
443        freq.add(b"key".to_vec());
444        // Increment via borrowed add
445        freq.add_borrowed(b"key");
446        freq.increment_by_borrowed(b"key", 3);
447        // All methods should see the same accumulated count
448        assert_eq!(freq.count(&b"key".to_vec()), 5);
449        assert_eq!(freq.cardinality(), 1);
450    }
451}