stats/
frequency.rs

1use foldhash::{HashMap, HashMapExt};
2use std::collections::hash_map::{Entry, Keys};
3use std::hash::Hash;
4
5use rayon::prelude::*;
6
7use crate::Commute;
8/// A commutative data structure for exact frequency counts.
9#[derive(Clone)]
10pub struct Frequencies<T> {
11    data: HashMap<T, u64>,
12}
13
14#[cfg(debug_assertions)]
15impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
16    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
17        write!(f, "{:?}", self.data)
18    }
19}
20
21impl<T: Eq + Hash> Frequencies<T> {
22    /// Create a new frequency table with no samples.
23    #[must_use]
24    pub fn new() -> Frequencies<T> {
25        Default::default()
26    }
27
28    // Add constructor with configurable capacity
29    #[must_use]
30    pub fn with_capacity(capacity: usize) -> Self {
31        Frequencies {
32            data: HashMap::with_capacity(capacity),
33        }
34    }
35
36    /// Add a value to the frequency table.
37    #[inline]
38    pub fn add(&mut self, v: T) {
39        *self.data.entry(v).or_insert(0) += 1;
40    }
41
42    /// Return the number of occurrences of `v` in the data.
43    #[inline]
44    #[must_use]
45    pub fn count(&self, v: &T) -> u64 {
46        self.data.get(v).copied().unwrap_or(0)
47    }
48
49    /// Return the cardinality (number of unique elements) in the data.
50    #[inline]
51    #[must_use]
52    pub fn cardinality(&self) -> u64 {
53        self.len() as u64
54    }
55
56    /// Returns the mode if one exists.
57    /// Note that there is also a `modes()` function that returns all
58    /// modes (plural) in `unsorted::modes()`. It returns all modes
59    /// with the same frequency.
60    #[inline]
61    #[must_use]
62    pub fn mode(&self) -> Option<&T> {
63        let (counts, _) = self.most_frequent();
64        if counts.is_empty() {
65            return None;
66        }
67        // If there is a tie for the most frequent element, return None.
68        if counts.len() >= 2 && counts[0].1 == counts[1].1 {
69            None
70        } else {
71            Some(counts[0].0)
72        }
73    }
74
75    /// Return a `Vec` of elements, their corresponding counts in
76    /// descending order, and the total count.
77    #[inline]
78    #[must_use]
79    pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
80        let len = self.data.len();
81        let mut counts = Vec::with_capacity(len);
82        let mut total_count = 0_u64;
83
84        for (k, &v) in &self.data {
85            total_count += v;
86            counts.push((k, v));
87        }
88        counts.sort_unstable_by(|&(_, c1), &(_, c2)| c2.cmp(&c1));
89        (counts, total_count)
90    }
91
92    /// Return a `Vec` of elements, their corresponding counts in
93    /// ascending order, and the total count.
94    #[inline]
95    #[must_use]
96    pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
97        let mut total_count = 0_u64;
98        let mut counts: Vec<_> = self
99            .data
100            .iter()
101            .map(|(k, &v)| {
102                total_count += v;
103                (k, v)
104            })
105            .collect();
106        counts.sort_unstable_by(|&(_, c1), &(_, c2)| c1.cmp(&c2));
107        (counts, total_count)
108    }
109
110    /// Return a `Vec` of elements, their corresponding counts in order
111    /// based on the `least` parameter, and the total count. Uses parallel sort.
112    #[inline]
113    #[must_use]
114    pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
115    where
116        for<'a> (&'a T, u64): Send,
117        T: Ord,
118    {
119        let mut total_count = 0_u64;
120        let mut counts: Vec<_> = self
121            .data
122            .iter()
123            .map(|(k, &v)| {
124                total_count += v;
125                (k, v)
126            })
127            .collect();
128        // sort by counts asc/desc
129        // if counts are equal, sort by values lexicographically
130        // We need to do this because otherwise the values are not guaranteed to be in order for equal counts
131        if least {
132            // return counts in ascending order
133            counts.par_sort_unstable_by(|&(v1, c1), &(v2, c2)| {
134                let cmp = c1.cmp(&c2);
135                if cmp == std::cmp::Ordering::Equal {
136                    v1.cmp(v2)
137                } else {
138                    cmp
139                }
140            });
141        } else {
142            // return counts in descending order
143            counts
144                .par_sort_unstable_by(|&(v1, c1), &(v2, c2)| c2.cmp(&c1).then_with(|| v1.cmp(v2)));
145        }
146        (counts, total_count)
147    }
148
149    /// Returns the cardinality of the data.
150    #[must_use]
151    pub fn len(&self) -> usize {
152        self.data.len()
153    }
154
155    /// Returns true if there is no frequency/cardinality data.
156    #[must_use]
157    pub fn is_empty(&self) -> bool {
158        self.data.is_empty()
159    }
160
161    /// Return an iterator over the unique values of the data.
162    #[must_use]
163    pub fn unique_values(&self) -> UniqueValues<'_, T> {
164        UniqueValues {
165            data_keys: self.data.keys(),
166        }
167    }
168
169    /// Get the top N most frequent items without sorting the entire vector
170    /// This is much faster than `most_frequent()` when you only need a few items
171    #[must_use]
172    pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
173    where
174        T: Ord,
175    {
176        use std::collections::BinaryHeap;
177
178        // We use a min-heap of size n to keep track of the largest elements
179        let mut heap = BinaryHeap::with_capacity(n + 1);
180
181        for (item, count) in &self.data {
182            // Negate count because BinaryHeap is a max-heap
183            // and we want to remove smallest elements
184            heap.push(std::cmp::Reverse((*count, item)));
185
186            // Keep heap size at n
187            if heap.len() > n {
188                heap.pop();
189            }
190        }
191
192        // Convert to sorted vector
193        heap.into_sorted_vec()
194            .into_iter()
195            .map(|std::cmp::Reverse((count, item))| (item, count))
196            .collect()
197    }
198
199    /// Similar to `top_n` but for least frequent items
200    #[must_use]
201    pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
202    where
203        T: Ord,
204    {
205        use std::collections::BinaryHeap;
206
207        let mut heap = BinaryHeap::with_capacity(n + 1);
208
209        for (item, count) in &self.data {
210            heap.push((*count, item));
211            if heap.len() > n {
212                heap.pop();
213            }
214        }
215
216        heap.into_sorted_vec()
217            .into_iter()
218            .map(|(count, item)| (item, count))
219            .collect()
220    }
221
222    /// Get items with exactly n occurrences
223    #[must_use]
224    pub fn items_with_count(&self, n: u64) -> Vec<&T> {
225        self.data
226            .iter()
227            .filter(|&(_, &count)| count == n)
228            .map(|(item, _)| item)
229            .collect()
230    }
231
232    /// Get the sum of all counts
233    #[must_use]
234    pub fn total_count(&self) -> u64 {
235        self.data.values().sum()
236    }
237
238    /// Check if any item occurs exactly n times
239    #[must_use]
240    pub fn has_count(&self, n: u64) -> bool {
241        self.data.values().any(|&count| count == n)
242    }
243
244    /// Add specialized method for single increment
245    #[inline]
246    pub fn increment_by(&mut self, v: T, count: u64) {
247        match self.data.entry(v) {
248            Entry::Vacant(entry) => {
249                entry.insert(count);
250            }
251            Entry::Occupied(mut entry) => {
252                *entry.get_mut() += count;
253            }
254        }
255    }
256}
257
258impl<T: Eq + Hash> Commute for Frequencies<T> {
259    #[inline]
260    fn merge(&mut self, v: Frequencies<T>) {
261        // Reserve additional capacity to avoid reallocations
262        self.data.reserve(v.data.len());
263
264        for (k, v2) in v.data {
265            match self.data.entry(k) {
266                Entry::Vacant(v1) => {
267                    v1.insert(v2);
268                }
269                Entry::Occupied(mut v1) => {
270                    *v1.get_mut() += v2;
271                }
272            }
273        }
274    }
275}
276
277impl<T: Eq + Hash> Default for Frequencies<T> {
278    #[inline]
279    fn default() -> Frequencies<T> {
280        Frequencies {
281            data: HashMap::with_capacity(1_000),
282        }
283    }
284}
285
286impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
287    #[inline]
288    fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
289        let mut v = Frequencies::new();
290        v.extend(it);
291        v
292    }
293}
294
295impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
296    #[inline]
297    fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
298        for sample in it {
299            self.add(sample);
300        }
301    }
302}
303
304/// An iterator over unique values in a frequencies count.
305pub struct UniqueValues<'a, K> {
306    data_keys: Keys<'a, K, u64>,
307}
308
309impl<'a, K> Iterator for UniqueValues<'a, K> {
310    type Item = &'a K;
311    fn next(&mut self) -> Option<Self::Item> {
312        self.data_keys.next()
313    }
314}
315
316#[cfg(test)]
317mod test {
318    use super::Frequencies;
319    use std::iter::FromIterator;
320
321    #[test]
322    fn ranked() {
323        let mut counts = Frequencies::new();
324        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
325        let (most_count, most_total) = counts.most_frequent();
326        assert_eq!(most_count[0], (&2, 5));
327        assert_eq!(most_total, 11);
328        let (least_count, least_total) = counts.least_frequent();
329        assert_eq!(least_count[0], (&3, 1));
330        assert_eq!(least_total, 11);
331        assert_eq!(
332            counts.least_frequent(),
333            (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
334        );
335    }
336
337    #[test]
338    fn ranked2() {
339        let mut counts = Frequencies::new();
340        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
341        let (most_count, most_total) = counts.par_frequent(false);
342        assert_eq!(most_count[0], (&2, 5));
343        assert_eq!(most_total, 11);
344        let (least_count, least_total) = counts.par_frequent(true);
345        assert_eq!(least_count[0], (&3, 1));
346        assert_eq!(least_total, 11);
347    }
348
349    #[test]
350    fn unique_values() {
351        let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
352        let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
353        unique.sort_unstable();
354        assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
355    }
356
357    #[test]
358    fn test_top_n() {
359        let mut freq = Frequencies::new();
360        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
361
362        let top_2 = freq.top_n(2);
363        assert_eq!(top_2.len(), 2);
364        assert_eq!(top_2[0], (&4, 4)); // Most frequent
365        assert_eq!(top_2[1], (&1, 3)); // Second most frequent
366
367        let bottom_2 = freq.bottom_n(2);
368        assert_eq!(bottom_2.len(), 2);
369        assert_eq!(bottom_2[0], (&3, 1)); // Least frequent
370        assert_eq!(bottom_2[1], (&2, 2)); // Second least frequent
371    }
372
373    #[test]
374    fn test_count_methods() {
375        let mut freq = Frequencies::new();
376        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
377
378        // Test total_count()
379        assert_eq!(freq.total_count(), 10);
380
381        // Test has_count()
382        assert!(freq.has_count(3)); // 1 appears 3 times
383        assert!(freq.has_count(4)); // 4 appears 4 times
384        assert!(freq.has_count(1)); // 3 appears 1 time
385        assert!(!freq.has_count(5)); // No element appears 5 times
386
387        // Test items_with_count()
388        let items_with_3 = freq.items_with_count(3);
389        assert_eq!(items_with_3, vec![&1]); // Only 1 appears 3 times
390
391        let items_with_2 = freq.items_with_count(2);
392        assert_eq!(items_with_2, vec![&2]); // Only 2 appears 2 times
393
394        let items_with_1 = freq.items_with_count(1);
395        assert_eq!(items_with_1, vec![&3]); // Only 3 appears 1 time
396
397        let items_with_4 = freq.items_with_count(4);
398        assert_eq!(items_with_4, vec![&4]); // Only 4 appears 4 times
399
400        let items_with_5 = freq.items_with_count(5);
401        assert!(items_with_5.is_empty()); // No elements appear 5 times
402    }
403}