stats/
frequency.rs

1use std::hash::Hash;
2
3use hashbrown::HashMap;
4use hashbrown::hash_map::{Entry, EntryRef, Keys};
5
6use rayon::prelude::*;
7use serde::{Deserialize, Serialize};
8
9use crate::Commute;
10
11const PARALLEL_THRESHOLD: usize = 10_000;
12/// A commutative data structure for exact frequency counts.
13#[derive(Clone, Serialize, Deserialize)]
14#[serde(bound(
15    serialize = "T: Serialize + Eq + Hash",
16    deserialize = "T: Deserialize<'de> + Eq + Hash"
17))]
18pub struct Frequencies<T> {
19    data: HashMap<T, u64>,
20}
21
22// Manual impl: the derive would bound on `T: PartialEq` only, but
23// `HashMap: PartialEq` requires `T: Eq + Hash`.
24impl<T: Eq + Hash> PartialEq for Frequencies<T> {
25    fn eq(&self, other: &Self) -> bool {
26        self.data == other.data
27    }
28}
29
30#[cfg(debug_assertions)]
31impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
32    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
33        write!(f, "{:?}", self.data)
34    }
35}
36
37impl<T: Eq + Hash> Frequencies<T> {
38    /// Create a new frequency table with no samples.
39    #[must_use]
40    pub fn new() -> Frequencies<T> {
41        Default::default()
42    }
43
44    // Add constructor with configurable capacity
45    #[must_use]
46    pub fn with_capacity(capacity: usize) -> Self {
47        Frequencies {
48            data: HashMap::with_capacity(capacity),
49        }
50    }
51
52    /// Add a value to the frequency table.
53    #[allow(clippy::inline_always)]
54    #[inline(always)]
55    pub fn add(&mut self, v: T) {
56        *self.data.entry(v).or_insert(0) += 1;
57    }
58
59    /// Return the number of occurrences of `v` in the data.
60    #[inline]
61    #[must_use]
62    pub fn count(&self, v: &T) -> u64 {
63        self.data.get(v).copied().unwrap_or(0)
64    }
65
66    /// Return the cardinality (number of unique elements) in the data.
67    #[inline]
68    #[must_use]
69    pub fn cardinality(&self) -> u64 {
70        self.len() as u64
71    }
72
73    /// Collect counts and total in a single pass, reused by `most/least_frequent`.
74    fn collect_counts(&self) -> (Vec<(&T, u64)>, u64) {
75        let mut total_count = 0u64;
76        let counts: Vec<(&T, u64)> = self
77            .data
78            .iter()
79            .map(|(k, &v)| {
80                total_count += v;
81                (k, v)
82            })
83            .collect();
84        (counts, total_count)
85    }
86
87    /// Return a `Vec` of elements, their corresponding counts in
88    /// descending order, and the total count.
89    #[inline]
90    #[must_use]
91    pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
92        let (mut counts, total_count) = self.collect_counts();
93        counts.sort_unstable_by_key(|&(_, c)| std::cmp::Reverse(c));
94        (counts, total_count)
95    }
96
97    /// Return a `Vec` of elements, their corresponding counts in
98    /// ascending order, and the total count.
99    #[inline]
100    #[must_use]
101    pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
102        let (mut counts, total_count) = self.collect_counts();
103        counts.sort_unstable_by_key(|&(_, c)| c);
104        (counts, total_count)
105    }
106
107    /// Return a `Vec` of elements, their corresponding counts in order
108    /// based on the `least` parameter, and the total count. Uses parallel sort.
109    #[inline]
110    #[must_use]
111    pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
112    where
113        for<'a> (&'a T, u64): Send,
114        T: Ord,
115    {
116        let (mut counts, total_count) = self.collect_counts();
117        // sort by counts asc/desc
118        // if counts are equal, sort by values lexicographically
119        // We need to do this because otherwise the values are not guaranteed to be in order for
120        // equal counts
121        if least {
122            // return counts in ascending order
123            let sort_fn =
124                |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c1.cmp(&c2).then_with(|| v1.cmp(v2));
125            if counts.len() < PARALLEL_THRESHOLD {
126                counts.sort_unstable_by(sort_fn);
127            } else {
128                counts.par_sort_unstable_by(sort_fn);
129            }
130        } else {
131            // return counts in descending order
132            let sort_fn =
133                |&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c2.cmp(&c1).then_with(|| v1.cmp(v2));
134            if counts.len() < PARALLEL_THRESHOLD {
135                counts.sort_unstable_by(sort_fn);
136            } else {
137                counts.par_sort_unstable_by(sort_fn);
138            }
139        }
140        (counts, total_count)
141    }
142
143    /// Returns the cardinality of the data.
144    #[must_use]
145    pub fn len(&self) -> usize {
146        self.data.len()
147    }
148
149    /// Returns true if there is no frequency/cardinality data.
150    #[must_use]
151    pub fn is_empty(&self) -> bool {
152        self.data.is_empty()
153    }
154
155    /// Return an iterator over the unique values of the data.
156    #[must_use]
157    pub fn unique_values(&self) -> UniqueValues<'_, T> {
158        UniqueValues {
159            data_keys: self.data.keys(),
160        }
161    }
162
163    /// Get the top N most frequent items without sorting the entire vector
164    /// This is much faster than `most_frequent()` when you only need a few items
165    #[must_use]
166    pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
167    where
168        T: Ord,
169    {
170        use std::collections::BinaryHeap;
171
172        // Ties are broken by value *ascending* so the result matches
173        // `most_frequent()` / `par_frequent(false)` (count desc, value asc).
174        // We achieve this by ranking each entry on `(count, Reverse(item))`:
175        // a larger count wins, and on a count tie the lexicographically smaller
176        // value wins (its `Reverse(item)` is larger).
177        //
178        // Min-heap (via the outer Reverse) of the n best entries seen so far.
179        // peek() returns the worst entry currently in the top-N — replace it
180        // only when a strictly better candidate comes in. Avoids a push+pop per
181        // rejected element on high-cardinality inputs with small n.
182        let mut heap = BinaryHeap::with_capacity(n);
183
184        for (item, count) in &self.data {
185            let candidate = (*count, std::cmp::Reverse(item));
186            if heap.len() < n {
187                heap.push(std::cmp::Reverse(candidate));
188            } else if let Some(top) = heap.peek()
189                && candidate > top.0
190            {
191                heap.pop();
192                heap.push(std::cmp::Reverse(candidate));
193            }
194        }
195
196        // Convert to sorted vector (count desc, value asc)
197        heap.into_sorted_vec()
198            .into_iter()
199            .map(|std::cmp::Reverse((count, std::cmp::Reverse(item)))| (item, count))
200            .collect()
201    }
202
203    /// Similar to `top_n` but for least frequent items
204    #[must_use]
205    pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
206    where
207        T: Ord,
208    {
209        use std::collections::BinaryHeap;
210
211        // Max-heap of the n smallest elements seen so far. Mirror of top_n.
212        let mut heap = BinaryHeap::with_capacity(n);
213
214        for (item, count) in &self.data {
215            let candidate = (*count, item);
216            if heap.len() < n {
217                heap.push(candidate);
218            } else if let Some(top) = heap.peek()
219                && candidate < *top
220            {
221                heap.pop();
222                heap.push(candidate);
223            }
224        }
225
226        heap.into_sorted_vec()
227            .into_iter()
228            .map(|(count, item)| (item, count))
229            .collect()
230    }
231
232    /// Get items with exactly n occurrences
233    #[must_use]
234    pub fn items_with_count(&self, n: u64) -> Vec<&T> {
235        self.data
236            .iter()
237            .filter(|&(_, &count)| count == n)
238            .map(|(item, _)| item)
239            .collect()
240    }
241
242    /// Get the sum of all counts
243    #[must_use]
244    pub fn total_count(&self) -> u64 {
245        self.data.values().sum()
246    }
247
248    /// Check if any item occurs exactly n times
249    #[must_use]
250    pub fn has_count(&self, n: u64) -> bool {
251        self.data.values().any(|&count| count == n)
252    }
253
254    /// Add specialized method for single increment
255    #[allow(clippy::inline_always)]
256    #[inline(always)]
257    pub fn increment_by(&mut self, v: T, count: u64) {
258        match self.data.entry(v) {
259            Entry::Vacant(entry) => {
260                entry.insert(count);
261            }
262            Entry::Occupied(mut entry) => {
263                *entry.get_mut() += count;
264            }
265        }
266    }
267}
268
269impl<T: Eq + Hash + Ord + Clone + Send + Sync> Frequencies<T> {
270    /// Returns the modes and antimodes of the data.
271    ///
272    /// Produces results identical to [`crate::Unsorted::modes_antimodes`] for the
273    /// same multiset of samples with per-value counts <= `u32::MAX` (verified by
274    /// the `modes_antimodes_matches_unsorted` property test and the equivalence
275    /// assertion in `benches/modesfreq.rs`). Above that the two diverge:
276    /// selection here is exact via full `u64` counts, while `Unsorted` tracks
277    /// `u32` run counts (and cannot practically hold that many samples anyway).
278    ///
279    /// Rather than sorting all unique values, this only sorts what the output
280    /// actually contains: one O(c) pass over the counts finds the
281    /// highest/lowest occurrence counts, a second pass collects only the
282    /// matching keys, then the (typically tiny) mode set is sorted and the 10
283    /// smallest antimodes are picked via `select_nth_unstable` - O(c) average
284    /// instead of O(c log c), where c is the cardinality. Uniform-count data
285    /// (`highest == lowest`: every value is both a mode and an antimode) falls
286    /// back to a single full key sort, which beats collecting the keys twice.
287    ///
288    /// Counts are compared as `u64`, so mode/antimode *selection* is exact even
289    /// when occurrence counts exceed `u32::MAX`; only the returned occurrence
290    /// counts saturate at `u32::MAX`.
291    ///
292    /// Returns `((modes, modes_count, mode_occurrences),
293    /// (antimodes, antimodes_count, antimode_occurrences))`.
294    /// Only the first 10 antimodes are returned.
295    #[allow(clippy::type_complexity)]
296    #[must_use]
297    pub fn modes_antimodes(&self) -> ((Vec<T>, usize, u32), (Vec<T>, usize, u32)) {
298        if self.data.is_empty() {
299            core::hint::cold_path();
300            return ((Vec::new(), 0, 0), (Vec::new(), 0, 0));
301        }
302
303        // full-width comparison: distinct counts above u32::MAX must not
304        // collapse into ties; saturation happens only at the return boundary
305        let mut highest_count = 1_u64;
306        let mut lowest_count = u64::MAX;
307        for &c in self.data.values() {
308            highest_count = highest_count.max(c);
309            lowest_count = lowest_count.min(c);
310        }
311        let highest_count_u32 = u32::try_from(highest_count).unwrap_or(u32::MAX);
312        let lowest_count_u32 = u32::try_from(lowest_count).unwrap_or(u32::MAX);
313
314        // single unique value: it is the mode, antimodes are empty
315        if self.data.len() == 1 {
316            let k = self.data.keys().next().unwrap();
317            return ((vec![k.clone()], 1, lowest_count_u32), (Vec::new(), 0, 0));
318        }
319
320        // all values unique: modes are empty, the 10 smallest values are the
321        // antimodes - selection instead of sorting all c keys
322        if highest_count == 1 {
323            let total = self.data.len();
324            let mut keys: Vec<&T> = self.data.keys().collect();
325            if keys.len() > 10 {
326                keys.select_nth_unstable(9);
327                keys.truncate(10);
328            }
329            keys.sort_unstable();
330            let antimodes: Vec<T> = keys.into_iter().cloned().collect();
331            return ((Vec::new(), 0, 0), (antimodes, total, 1));
332        }
333
334        // uniform counts: every value is both a mode and an antimode, so all
335        // keys must be sorted anyway - one full sort, no second collect
336        if highest_count == lowest_count {
337            let mut keys: Vec<&T> = self.data.keys().collect();
338            if keys.len() > PARALLEL_THRESHOLD {
339                keys.par_sort_unstable();
340            } else {
341                keys.sort_unstable();
342            }
343            let total = keys.len();
344            let antimodes: Vec<T> = keys.iter().take(10).map(|k| (*k).clone()).collect();
345            let modes: Vec<T> = keys.into_iter().cloned().collect();
346            return (
347                (modes, total, highest_count_u32),
348                (antimodes, total, lowest_count_u32),
349            );
350        }
351
352        // general case: collect only the keys that appear in the output
353        // (highest != lowest here, so a key matches at most one bucket)
354        let mut modes: Vec<&T> = Vec::new();
355        let mut antimodes: Vec<&T> = Vec::new();
356        for (k, &c) in &self.data {
357            if c == highest_count {
358                modes.push(k);
359            } else if c == lowest_count {
360                antimodes.push(k);
361            }
362        }
363
364        if modes.len() > PARALLEL_THRESHOLD {
365            modes.par_sort_unstable();
366        } else {
367            modes.sort_unstable();
368        }
369        let antimodes_total = antimodes.len();
370        if antimodes_total > 10 {
371            antimodes.select_nth_unstable(9);
372            antimodes.truncate(10);
373        }
374        antimodes.sort_unstable();
375
376        (
377            (
378                modes.iter().map(|k| (*k).clone()).collect(),
379                modes.len(),
380                highest_count_u32,
381            ),
382            (
383                antimodes.into_iter().cloned().collect(),
384                antimodes_total,
385                lowest_count_u32,
386            ),
387        )
388    }
389}
390
391impl Frequencies<Vec<u8>> {
392    /// Increment count for a byte slice key, avoiding allocation when key exists.
393    /// Uses hashbrown's `entry_ref(&[u8])`, which probes once with the borrowed
394    /// key and only allocates (`[u8]::to_owned()` -> `Vec<u8>`) on the vacant
395    /// branch. For low-cardinality columns (the common case), this eliminates
396    /// ~99% of allocations; for new keys it is a single hash+probe (std's
397    /// HashMap has no stable raw-entry API, so the old path hashed twice).
398    #[allow(clippy::inline_always)]
399    #[inline(always)]
400    pub fn add_borrowed(&mut self, v: &[u8]) {
401        *self.data.entry_ref(v).or_insert(0) += 1;
402    }
403
404    /// Increment by a count for a byte slice key, avoiding allocation when key exists.
405    #[allow(clippy::inline_always)]
406    #[inline(always)]
407    pub fn increment_by_borrowed(&mut self, v: &[u8], count: u64) {
408        *self.data.entry_ref(v).or_insert(0) += count;
409    }
410
411    /// Increment the count for `v`, enforcing a cardinality cap.
412    ///
413    /// Existing keys always increment (the map doesn't grow). A NEW key that
414    /// would grow the map past `cap` unique entries is rejected: the map is
415    /// left unchanged and `false` is returned, so the caller can drop the
416    /// tracker. `cap == 0` means unbounded.
417    ///
418    /// Like [`Self::add_borrowed`], this single-probes via `entry_ref` and
419    /// only allocates an owned key on the (admitted) vacant branch.
420    #[allow(clippy::inline_always)]
421    #[inline(always)]
422    pub fn add_borrowed_capped(&mut self, v: &[u8], cap: u64) -> bool {
423        let len = self.data.len() as u64;
424        match self.data.entry_ref(v) {
425            EntryRef::Occupied(mut e) => {
426                *e.get_mut() += 1;
427                true
428            }
429            EntryRef::Vacant(e) => {
430                if cap > 0 && len >= cap {
431                    false
432                } else {
433                    e.insert(1);
434                    true
435                }
436            }
437        }
438    }
439}
440
441impl<T: Eq + Hash> Commute for Frequencies<T> {
442    #[inline]
443    fn merge(&mut self, v: Frequencies<T>) {
444        // Reserve additional capacity to avoid reallocations
445        self.data.reserve(v.data.len());
446
447        for (k, v2) in v.data {
448            match self.data.entry(k) {
449                Entry::Vacant(v1) => {
450                    v1.insert(v2);
451                }
452                Entry::Occupied(mut v1) => {
453                    *v1.get_mut() += v2;
454                }
455            }
456        }
457    }
458}
459
460impl<T: Eq + Hash> Default for Frequencies<T> {
461    #[inline]
462    fn default() -> Frequencies<T> {
463        Frequencies {
464            data: HashMap::with_capacity(64),
465        }
466    }
467}
468
469impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
470    #[inline]
471    fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
472        let mut v = Frequencies::new();
473        v.extend(it);
474        v
475    }
476}
477
478impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
479    #[inline]
480    fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
481        let iter = it.into_iter();
482        // Reserve capacity if size hint is available and reliable
483        if let (lower, Some(upper)) = iter.size_hint()
484            && lower == upper
485        {
486            // Exact size known - reserve capacity for new entries
487            // We don't know how many will be new vs existing, so reserve conservatively
488            self.data.reserve(lower.saturating_sub(self.data.len()));
489        }
490        for sample in iter {
491            self.add(sample);
492        }
493    }
494}
495
496/// An iterator over unique values in a frequencies count.
497pub struct UniqueValues<'a, K> {
498    data_keys: Keys<'a, K, u64>,
499}
500
501impl<'a, K> Iterator for UniqueValues<'a, K> {
502    type Item = &'a K;
503    #[inline]
504    fn next(&mut self) -> Option<Self::Item> {
505        self.data_keys.next()
506    }
507
508    // Forward the exact size hint from the underlying `Keys` iterator so that
509    // `collect`/`extend` can preallocate exactly instead of reallocating.
510    #[inline]
511    fn size_hint(&self) -> (usize, Option<usize>) {
512        self.data_keys.size_hint()
513    }
514}
515
516impl<K> ExactSizeIterator for UniqueValues<'_, K> {
517    #[inline]
518    fn len(&self) -> usize {
519        self.data_keys.len()
520    }
521}
522
523impl<K> std::iter::FusedIterator for UniqueValues<'_, K> {}
524
525#[cfg(test)]
526mod test {
527    use super::Frequencies;
528    use std::iter::FromIterator;
529
530    #[test]
531    fn ranked() {
532        let mut counts = Frequencies::new();
533        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
534        let (most_count, most_total) = counts.most_frequent();
535        assert_eq!(most_count[0], (&2, 5));
536        assert_eq!(most_total, 11);
537        let (least_count, least_total) = counts.least_frequent();
538        assert_eq!(least_count[0], (&3, 1));
539        assert_eq!(least_total, 11);
540        assert_eq!(
541            counts.least_frequent(),
542            (vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
543        );
544    }
545
546    #[test]
547    fn ranked2() {
548        let mut counts = Frequencies::new();
549        counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
550        let (most_count, most_total) = counts.par_frequent(false);
551        assert_eq!(most_count[0], (&2, 5));
552        assert_eq!(most_total, 11);
553        let (least_count, least_total) = counts.par_frequent(true);
554        assert_eq!(least_count[0], (&3, 1));
555        assert_eq!(least_total, 11);
556    }
557
558    #[test]
559    fn unique_values() {
560        let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
561        let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
562        unique.sort_unstable();
563        assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
564    }
565
566    #[test]
567    fn test_top_n() {
568        let mut freq = Frequencies::new();
569        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
570
571        let top_2 = freq.top_n(2);
572        assert_eq!(top_2.len(), 2);
573        assert_eq!(top_2[0], (&4, 4)); // Most frequent
574        assert_eq!(top_2[1], (&1, 3)); // Second most frequent
575
576        let bottom_2 = freq.bottom_n(2);
577        assert_eq!(bottom_2.len(), 2);
578        assert_eq!(bottom_2[0], (&3, 1)); // Least frequent
579        assert_eq!(bottom_2[1], (&2, 2)); // Second least frequent
580    }
581
582    // top_n/bottom_n must select the SAME set and order as
583    // par_frequent(..) truncated to n, including the tie-break at the
584    // n/n+1 boundary (count primary, value ascending on ties). This is the
585    // invariant qsv's `frequency --limit N` fast path relies on.
586    #[test]
587    fn top_n_matches_par_frequent_all_ties() {
588        // All counts equal: every comparison is a pure value tie-break.
589        let mut freq = Frequencies::new();
590        freq.extend(vec![1usize, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
591        for n in 0..=10usize {
592            let (full_desc, _) = freq.par_frequent(false);
593            let expected_desc: Vec<(&usize, u64)> = full_desc.into_iter().take(n).collect();
594            assert_eq!(freq.top_n(n), expected_desc, "top_n({n}) all-ties mismatch");
595
596            let (full_asc, _) = freq.par_frequent(true);
597            let expected_asc: Vec<(&usize, u64)> = full_asc.into_iter().take(n).collect();
598            assert_eq!(
599                freq.bottom_n(n),
600                expected_asc,
601                "bottom_n({n}) all-ties mismatch"
602            );
603        }
604    }
605
606    #[test]
607    fn top_n_matches_par_frequent_boundary_ties() {
608        // Counts deliberately tie across the cutoff: values 10..19 all count 2,
609        // values 20..24 count 5, values 30..34 count 1.
610        let mut freq = Frequencies::new();
611        for v in 10..20usize {
612            freq.extend(vec![v, v]); // count 2
613        }
614        for v in 20..25usize {
615            freq.extend(vec![v; 5]); // count 5
616        }
617        for v in 30..35usize {
618            freq.extend(vec![v]); // count 1
619        }
620        for n in 0..=freq.len() + 2 {
621            let (full_desc, _) = freq.par_frequent(false);
622            let expected_desc: Vec<(&usize, u64)> = full_desc.into_iter().take(n).collect();
623            assert_eq!(
624                freq.top_n(n),
625                expected_desc,
626                "top_n({n}) boundary-tie mismatch"
627            );
628
629            let (full_asc, _) = freq.par_frequent(true);
630            let expected_asc: Vec<(&usize, u64)> = full_asc.into_iter().take(n).collect();
631            assert_eq!(
632                freq.bottom_n(n),
633                expected_asc,
634                "bottom_n({n}) boundary-tie mismatch"
635            );
636        }
637    }
638
639    #[test]
640    fn test_count_methods() {
641        let mut freq = Frequencies::new();
642        freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
643
644        // Test total_count()
645        assert_eq!(freq.total_count(), 10);
646
647        // Test has_count()
648        assert!(freq.has_count(3)); // 1 appears 3 times
649        assert!(freq.has_count(4)); // 4 appears 4 times
650        assert!(freq.has_count(1)); // 3 appears 1 time
651        assert!(!freq.has_count(5)); // No element appears 5 times
652
653        // Test items_with_count()
654        let items_with_3 = freq.items_with_count(3);
655        assert_eq!(items_with_3, vec![&1]); // Only 1 appears 3 times
656
657        let items_with_2 = freq.items_with_count(2);
658        assert_eq!(items_with_2, vec![&2]); // Only 2 appears 2 times
659
660        let items_with_1 = freq.items_with_count(1);
661        assert_eq!(items_with_1, vec![&3]); // Only 3 appears 1 time
662
663        let items_with_4 = freq.items_with_count(4);
664        assert_eq!(items_with_4, vec![&4]); // Only 4 appears 4 times
665
666        let items_with_5 = freq.items_with_count(5);
667        assert!(items_with_5.is_empty()); // No elements appear 5 times
668    }
669
670    #[test]
671    fn add_borrowed_inserts_new_key() {
672        let mut freq = Frequencies::<Vec<u8>>::new();
673        freq.add_borrowed(b"hello");
674        assert_eq!(freq.count(&b"hello".to_vec()), 1);
675        assert_eq!(freq.cardinality(), 1);
676    }
677
678    #[test]
679    fn add_borrowed_increments_existing_key() {
680        let mut freq = Frequencies::<Vec<u8>>::new();
681        freq.add_borrowed(b"hello");
682        freq.add_borrowed(b"hello");
683        freq.add_borrowed(b"hello");
684        assert_eq!(freq.count(&b"hello".to_vec()), 3);
685        assert_eq!(freq.cardinality(), 1);
686
687        // Also test increment_by_borrowed
688        freq.increment_by_borrowed(b"world", 5);
689        assert_eq!(freq.count(&b"world".to_vec()), 5);
690        freq.increment_by_borrowed(b"world", 3);
691        assert_eq!(freq.count(&b"world".to_vec()), 8);
692    }
693
694    #[test]
695    fn borrowed_owned_interop_for_same_key() {
696        let mut freq = Frequencies::<Vec<u8>>::new();
697        // Insert via owned add
698        freq.add(b"key".to_vec());
699        // Increment via borrowed add
700        freq.add_borrowed(b"key");
701        freq.increment_by_borrowed(b"key", 3);
702        // All methods should see the same accumulated count
703        assert_eq!(freq.count(&b"key".to_vec()), 5);
704        assert_eq!(freq.cardinality(), 1);
705    }
706
707    /// Property test: `Frequencies::modes_antimodes` must produce results
708    /// identical to `Unsorted::modes_antimodes` for the same multiset of
709    /// samples, and the cardinalities must match. This is the behavior
710    /// preservation proof for replacing the `Unsorted<Vec<u8>>` modes tracker
711    /// with a `Frequencies<Vec<u8>>` counted-runs map.
712    #[test]
713    fn modes_antimodes_matches_unsorted() {
714        // simple deterministic LCG so the test needs no rand dependency
715        let mut seed = 0xDEAD_BEEF_u64;
716        let mut next = move |bound: u64| {
717            seed = seed.wrapping_mul(6_364_136_223_846_793_005).wrapping_add(1);
718            (seed >> 33) % bound
719        };
720
721        for case in 0..200 {
722            // vary size and cardinality to hit the special cases:
723            // empty, single unique, all unique, ties, skewed
724            let n_samples = next(50) as usize;
725            let key_space = 1 + next(30);
726
727            let mut unsorted = crate::Unsorted::<Vec<u8>>::default();
728            let mut freqs = Frequencies::<Vec<u8>>::new();
729            for _ in 0..n_samples {
730                let key = format!("k{:02}", next(key_space)).into_bytes();
731                unsorted.add_bytes(&key);
732                freqs.add_borrowed(&key);
733            }
734
735            assert_eq!(
736                unsorted.cardinality(false, 1),
737                freqs.cardinality(),
738                "cardinality mismatch in case {case}"
739            );
740            assert_eq!(
741                unsorted.modes_antimodes(),
742                freqs.modes_antimodes(),
743                "modes/antimodes mismatch in case {case} (n={n_samples}, k={key_space})"
744            );
745        }
746
747        // explicit edge cases
748        // empty
749        let mut u = crate::Unsorted::<Vec<u8>>::default();
750        let f = Frequencies::<Vec<u8>>::new();
751        assert_eq!(u.modes_antimodes(), f.modes_antimodes());
752
753        // single unique value, multiple occurrences
754        let mut u = crate::Unsorted::<Vec<u8>>::default();
755        let mut f = Frequencies::<Vec<u8>>::new();
756        for _ in 0..5 {
757            u.add_bytes(b"only");
758            f.add_borrowed(b"only");
759        }
760        assert_eq!(u.modes_antimodes(), f.modes_antimodes());
761
762        // all values unique (highest_count == 1), more than 10 antimodes
763        let mut u = crate::Unsorted::<Vec<u8>>::default();
764        let mut f = Frequencies::<Vec<u8>>::new();
765        for i in 0..15 {
766            let key = format!("u{i:02}").into_bytes();
767            u.add_bytes(&key);
768            f.add_borrowed(&key);
769        }
770        assert_eq!(u.modes_antimodes(), f.modes_antimodes());
771    }
772
773    /// Regression test: distinct counts above u32::MAX must not collapse into
774    /// ties during mode/antimode selection. Selection compares full u64
775    /// counts; only the returned occurrence counts saturate at u32::MAX.
776    #[test]
777    fn modes_antimodes_counts_above_u32_max() {
778        let big = u64::from(u32::MAX) + 10;
779
780        let mut f = Frequencies::new();
781        f.increment_by(b"big".to_vec(), big);
782        f.increment_by(b"bigger".to_vec(), big + 5);
783        f.increment_by(b"small".to_vec(), 3);
784
785        let ((modes, modes_count, mode_occ), (antimodes, anti_count, anti_occ)) =
786            f.modes_antimodes();
787        // "bigger" alone is the mode - under u32 saturation big and bigger
788        // would have tied and both been returned
789        assert_eq!(modes, vec![b"bigger".to_vec()]);
790        assert_eq!(modes_count, 1);
791        assert_eq!(mode_occ, u32::MAX); // reported occurrence saturates
792        assert_eq!(antimodes, vec![b"small".to_vec()]);
793        assert_eq!(anti_count, 1);
794        assert_eq!(anti_occ, 3);
795
796        // both counts above u32::MAX: lowest also saturates in the report,
797        // but selection still distinguishes them
798        let mut f2 = Frequencies::new();
799        f2.increment_by(b"a".to_vec(), big);
800        f2.increment_by(b"b".to_vec(), big + 1);
801        let ((modes2, _, mode_occ2), (antimodes2, _, anti_occ2)) = f2.modes_antimodes();
802        assert_eq!(modes2, vec![b"b".to_vec()]);
803        assert_eq!(antimodes2, vec![b"a".to_vec()]);
804        assert_eq!(mode_occ2, u32::MAX);
805        assert_eq!(anti_occ2, u32::MAX);
806    }
807}
stats/frequency.rs

stats/
frequency.rs