stats/
frequency.rs

1use foldhash::{HashMap, HashMapExt};
2use std::collections::hash_map::{Entry, Keys};
3use std::hash::Hash;
4
5use rayon::prelude::*;
6
7use crate::Commute;
8/// A commutative data structure for exact frequency counts.
9#[derive(Clone)]
10pub struct Frequencies<T> {
11    data: HashMap<T, u64>,
12}
13
14#[cfg(debug_assertions)]
15impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
16    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
17        write!(f, "{:?}", self.data)
18    }
19}
20
21impl<T: Eq + Hash> Frequencies<T> {
22    /// Create a new frequency table with no samples.
23    #[must_use]
24    pub fn new() -> Frequencies<T> {
25        Default::default()
26    }
27
28    // Add constructor with configurable capacity
29    #[must_use]
30    pub fn with_capacity(capacity: usize) -> Self {
31        Frequencies {
32            data: HashMap::with_capacity(capacity),
33        }
34    }
35
36    /// Add a value to the frequency table.
37    #[inline]
38    pub fn add(&mut self, v: T) {
39        *self.data.entry(v).or_insert(0) += 1;
40    }
41
42    /// Return the number of occurrences of `v` in the data.
43    #[inline]
44    #[must_use]
45    pub fn count(&self, v: &T) -> u64 {
46        self.data.get(v).copied().unwrap_or(0)
47    }
48
49    /// Return the cardinality (number of unique elements) in the data.
50    #[inline]
51    #[must_use]
52    pub fn cardinality(&self) -> u64 {
53        self.len() as u64
54    }
55
56    /// Return a `Vec` of elements, their corresponding counts in
57    /// descending order, and the total count.
58    #[inline]
59    #[must_use]
60    pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
61        let len = self.data.len();
62        let total_count: u64 = self.data.values().sum();
63        let mut counts = Vec::with_capacity(len);
64
65        for (k, &v) in &self.data {
66            counts.push((k, v));
67        }
68        counts.sort_unstable_by(|&(_, c1), &(_, c2)| c2.cmp(&c1));
69        (counts, total_count)
70    }
71
72    /// Return a `Vec` of elements, their corresponding counts in
73    /// ascending order, and the total count.
74    #[inline]
75    #[must_use]
76    pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
77        let len = self.data.len();
78        let total_count: u64 = self.data.values().sum();
79        let mut counts = Vec::with_capacity(len);
80
81        for (k, &v) in &self.data {
82            counts.push((k, v));
83        }
84        counts.sort_unstable_by(|&(_, c1), &(_, c2)| c1.cmp(&c2));
85        (counts, total_count)
86    }
87
88    /// Return a `Vec` of elements, their corresponding counts in order
89    /// based on the `least` parameter, and the total count. Uses parallel sort.
90    #[inline]
91    #[must_use]
92    pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
93    where
94        for<'a> (&'a T, u64): Send,
95        T: Ord,
96    {
97        let len = self.data.len();
98        let total_count: u64 = self.data.values().sum();
99        let mut counts = Vec::with_capacity(len);
100
101        for (k, &v) in &self.data {
102            counts.push((k, v));
103        }
104        // sort by counts asc/desc
105        // if counts are equal, sort by values lexicographically
106        // We need to do this because otherwise the values are not guaranteed to be in order for equal counts
107        if least {
108            // return counts in ascending order
109            counts.par_sort_unstable_by(|&(v1, c1), &(v2, c2)| {
110                let cmp = c1.cmp(&c2);
111                if cmp == std::cmp::Ordering::Equal {
112                    v1.cmp(v2)
113                } else {
114                    cmp
115                }
116            });
117        } else {
118            // return counts in descending order
119            counts
120                .par_sort_unstable_by(|&(v1, c1), &(v2, c2)| c2.cmp(&c1).then_with(|| v1.cmp(v2)));
121        }
122        (counts, total_count)
123    }
124
125    /// Returns the cardinality of the data.
126    #[must_use]
127    pub fn len(&self) -> usize {
128        self.data.len()
129    }
130
131    /// Returns true if there is no frequency/cardinality data.
132    #[must_use]
133    pub fn is_empty(&self) -> bool {
134        self.data.is_empty()
135    }
136
137    /// Return an iterator over the unique values of the data.
138    #[must_use]
139    pub fn unique_values(&self) -> UniqueValues<'_, T> {
140        UniqueValues {
141            data_keys: self.data.keys(),
142        }
143    }
144
145    /// Get the top N most frequent items without sorting the entire vector
146    /// This is much faster than `most_frequent()` when you only need a few items
147    #[must_use]
148    pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
149    where
150        T: Ord,
151    {
152        use std::collections::BinaryHeap;
153
154        // We use a min-heap of size n to keep track of the largest elements
155        let mut heap = BinaryHeap::with_capacity(n + 1);
156
157        for (item, count) in &self.data {
158            // Negate count because BinaryHeap is a max-heap
159            // and we want to remove smallest elements
160            heap.push(std::cmp::Reverse((*count, item)));
161
162            // Keep heap size at n
163            if heap.len() > n {
164                heap.pop();
165            }
166        }
167
168        // Convert to sorted vector
169        heap.into_sorted_vec()
170            .into_iter()
171            .map(|std::cmp::Reverse((count, item))| (item, count))
172            .collect()
173    }
174
175    /// Similar to `top_n` but for least frequent items
176    #[must_use]
177    pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
178    where
179        T: Ord,
180    {
181        use std::collections::BinaryHeap;
182
183        let mut heap = BinaryHeap::with_capacity(n + 1);
184
185        for (item, count) in &self.data {
186            heap.push((*count, item));
187            if heap.len() > n {
188                heap.pop();
189            }
190        }
191
192        heap.into_sorted_vec()
193            .into_iter()
194            .map(|(count, item)| (item, count))
195            .collect()
196    }
197
198    /// Get items with exactly n occurrences
199    #[must_use]
200    pub fn items_with_count(&self, n: u64) -> Vec<&T> {
201        self.data
202            .iter()
203            .filter(|&(_, &count)| count == n)
204            .map(|(item, _)| item)
205            .collect()
206    }
207
208    /// Get the sum of all counts
209    #[must_use]
210    pub fn total_count(&self) -> u64 {
211        self.data.values().sum()
212    }
213
214    /// Check if any item occurs exactly n times
215    #[must_use]
216    pub fn has_count(&self, n: u64) -> bool {
217        self.data.values().any(|&count| count == n)
218    }
219
220    /// Add specialized method for single increment
221    #[inline]
222    pub fn increment_by(&mut self, v: T, count: u64) {
223        match self.data.entry(v) {
224            Entry::Vacant(entry) => {
225                entry.insert(count);
226            }
227            Entry::Occupied(mut entry) => {
228                *entry.get_mut() += count;
229            }
230        }
231    }
232}
233
234impl<T: Eq + Hash> Commute for Frequencies<T> {
235    #[inline]
236    fn merge(&mut self, v: Frequencies<T>) {
237        // Reserve additional capacity to avoid reallocations
238        self.data.reserve(v.data.len());
239
240        for (k, v2) in v.data {
241            match self.data.entry(k) {
242                Entry::Vacant(v1) => {
243                    v1.insert(v2);
244                }
245                Entry::Occupied(mut v1) => {
246                    *v1.get_mut() += v2;
247                }
248            }
249        }
250    }
251}
252
253impl<T: Eq + Hash> Default for Frequencies<T> {
254    #[inline]
255    fn default() -> Frequencies<T> {
256        Frequencies {
257            data: HashMap::with_capacity(1_000),
258        }
259    }
260}
261
262impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
263    #[inline]
264    fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
265        let mut v = Frequencies::new();
266        v.extend(it);
267        v
268    }
269}
270
271impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
272    #[inline]
273    fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
274        let iter = it.into_iter();
275        // Reserve capacity if size hint is available and reliable
276        if let (lower, Some(upper)) = iter.size_hint()
277            && lower == upper
278        {
279            // Exact size known - reserve capacity for new entries
280            // We don't know how many will be new vs existing, so reserve conservatively
281            self.data.reserve(lower.saturating_sub(self.data.len()));
282        }
283        for sample in iter {
284            self.add(sample);
285        }
286    }
287}
288
289/// An iterator over unique values in a frequencies count.
290pub struct UniqueValues<'a, K> {
291    data_keys: Keys<'a, K, u64>,
292}
293
294impl<'a, K> Iterator for UniqueValues<'a, K> {
295    type Item = &'a K;
296    fn next(&mut self) -> Option<Self::Item> {
297        self.data_keys.next()
298    }
299}
300
301#[cfg(test)]
302mod test {
303    use super::Frequencies;
304    use std::iter::FromIterator;
305
306    #[test]
307    fn ranked() {
308        let mut counts = Frequencies::new();
309        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
310        let (most_count, most_total) = counts.most_frequent();
311        assert_eq!(most_count[0], (&2, 5));
312        assert_eq!(most_total, 11);
313        let (least_count, least_total) = counts.least_frequent();
314        assert_eq!(least_count[0], (&3, 1));
315        assert_eq!(least_total, 11);
316        assert_eq!(
317            counts.least_frequent(),
318            (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
319        );
320    }
321
322    #[test]
323    fn ranked2() {
324        let mut counts = Frequencies::new();
325        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
326        let (most_count, most_total) = counts.par_frequent(false);
327        assert_eq!(most_count[0], (&2, 5));
328        assert_eq!(most_total, 11);
329        let (least_count, least_total) = counts.par_frequent(true);
330        assert_eq!(least_count[0], (&3, 1));
331        assert_eq!(least_total, 11);
332    }
333
334    #[test]
335    fn unique_values() {
336        let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
337        let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
338        unique.sort_unstable();
339        assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
340    }
341
342    #[test]
343    fn test_top_n() {
344        let mut freq = Frequencies::new();
345        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
346
347        let top_2 = freq.top_n(2);
348        assert_eq!(top_2.len(), 2);
349        assert_eq!(top_2[0], (&4, 4)); // Most frequent
350        assert_eq!(top_2[1], (&1, 3)); // Second most frequent
351
352        let bottom_2 = freq.bottom_n(2);
353        assert_eq!(bottom_2.len(), 2);
354        assert_eq!(bottom_2[0], (&3, 1)); // Least frequent
355        assert_eq!(bottom_2[1], (&2, 2)); // Second least frequent
356    }
357
358    #[test]
359    fn test_count_methods() {
360        let mut freq = Frequencies::new();
361        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
362
363        // Test total_count()
364        assert_eq!(freq.total_count(), 10);
365
366        // Test has_count()
367        assert!(freq.has_count(3)); // 1 appears 3 times
368        assert!(freq.has_count(4)); // 4 appears 4 times
369        assert!(freq.has_count(1)); // 3 appears 1 time
370        assert!(!freq.has_count(5)); // No element appears 5 times
371
372        // Test items_with_count()
373        let items_with_3 = freq.items_with_count(3);
374        assert_eq!(items_with_3, vec![&1]); // Only 1 appears 3 times
375
376        let items_with_2 = freq.items_with_count(2);
377        assert_eq!(items_with_2, vec![&2]); // Only 2 appears 2 times
378
379        let items_with_1 = freq.items_with_count(1);
380        assert_eq!(items_with_1, vec![&3]); // Only 3 appears 1 time
381
382        let items_with_4 = freq.items_with_count(4);
383        assert_eq!(items_with_4, vec![&4]); // Only 4 appears 4 times
384
385        let items_with_5 = freq.items_with_count(5);
386        assert!(items_with_5.is_empty()); // No elements appear 5 times
387    }
388}