use std::collections::hash_map::{Entry, Keys};
use std::hash::Hash;
use foldhash::{HashMap, HashMapExt};
use rayon::prelude::*;
use crate::Commute;
const PARALLEL_THRESHOLD: usize = 10_000;
#[derive(Clone)]
pub struct Frequencies<T> {
data: HashMap<T, u64>,
}
#[cfg(debug_assertions)]
impl<T: std::fmt::Debug + Eq + Hash> std::fmt::Debug for Frequencies<T> {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{:?}", self.data)
}
}
impl<T: Eq + Hash> Frequencies<T> {
#[must_use]
pub fn new() -> Frequencies<T> {
Default::default()
}
#[must_use]
pub fn with_capacity(capacity: usize) -> Self {
Frequencies {
data: HashMap::with_capacity(capacity),
}
}
#[inline]
pub fn add(&mut self, v: T) {
*self.data.entry(v).or_insert(0) += 1;
}
#[inline]
#[must_use]
pub fn count(&self, v: &T) -> u64 {
self.data.get(v).copied().unwrap_or(0)
}
#[inline]
#[must_use]
pub fn cardinality(&self) -> u64 {
self.len() as u64
}
fn collect_counts(&self) -> (Vec<(&T, u64)>, u64) {
let mut total_count = 0u64;
let counts: Vec<(&T, u64)> = self
.data
.iter()
.map(|(k, &v)| {
total_count += v;
(k, v)
})
.collect();
(counts, total_count)
}
#[inline]
#[must_use]
pub fn most_frequent(&self) -> (Vec<(&T, u64)>, u64) {
let (mut counts, total_count) = self.collect_counts();
counts.sort_unstable_by_key(|&(_, c)| std::cmp::Reverse(c));
(counts, total_count)
}
#[inline]
#[must_use]
pub fn least_frequent(&self) -> (Vec<(&T, u64)>, u64) {
let (mut counts, total_count) = self.collect_counts();
counts.sort_unstable_by_key(|&(_, c)| c);
(counts, total_count)
}
#[inline]
#[must_use]
pub fn par_frequent(&self, least: bool) -> (Vec<(&T, u64)>, u64)
where
for<'a> (&'a T, u64): Send,
T: Ord,
{
let (mut counts, total_count) = self.collect_counts();
if least {
let sort_fn =
|&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c1.cmp(&c2).then_with(|| v1.cmp(v2));
if counts.len() < PARALLEL_THRESHOLD {
counts.sort_unstable_by(sort_fn);
} else {
counts.par_sort_unstable_by(sort_fn);
}
} else {
let sort_fn =
|&(v1, c1): &(&T, u64), &(v2, c2): &(&T, u64)| c2.cmp(&c1).then_with(|| v1.cmp(v2));
if counts.len() < PARALLEL_THRESHOLD {
counts.sort_unstable_by(sort_fn);
} else {
counts.par_sort_unstable_by(sort_fn);
}
}
(counts, total_count)
}
#[must_use]
pub fn len(&self) -> usize {
self.data.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.data.is_empty()
}
#[must_use]
pub fn unique_values(&self) -> UniqueValues<'_, T> {
UniqueValues {
data_keys: self.data.keys(),
}
}
#[must_use]
pub fn top_n(&self, n: usize) -> Vec<(&T, u64)>
where
T: Ord,
{
use std::collections::BinaryHeap;
let mut heap = BinaryHeap::with_capacity(n);
for (item, count) in &self.data {
let candidate = (*count, item);
if heap.len() < n {
heap.push(std::cmp::Reverse(candidate));
} else if let Some(top) = heap.peek()
&& candidate > top.0
{
heap.pop();
heap.push(std::cmp::Reverse(candidate));
}
}
heap.into_sorted_vec()
.into_iter()
.map(|std::cmp::Reverse((count, item))| (item, count))
.collect()
}
#[must_use]
pub fn bottom_n(&self, n: usize) -> Vec<(&T, u64)>
where
T: Ord,
{
use std::collections::BinaryHeap;
let mut heap = BinaryHeap::with_capacity(n);
for (item, count) in &self.data {
let candidate = (*count, item);
if heap.len() < n {
heap.push(candidate);
} else if let Some(top) = heap.peek()
&& candidate < *top
{
heap.pop();
heap.push(candidate);
}
}
heap.into_sorted_vec()
.into_iter()
.map(|(count, item)| (item, count))
.collect()
}
#[must_use]
pub fn items_with_count(&self, n: u64) -> Vec<&T> {
self.data
.iter()
.filter(|&(_, &count)| count == n)
.map(|(item, _)| item)
.collect()
}
#[must_use]
pub fn total_count(&self) -> u64 {
self.data.values().sum()
}
#[must_use]
pub fn has_count(&self, n: u64) -> bool {
self.data.values().any(|&count| count == n)
}
#[inline]
pub fn increment_by(&mut self, v: T, count: u64) {
match self.data.entry(v) {
Entry::Vacant(entry) => {
entry.insert(count);
}
Entry::Occupied(mut entry) => {
*entry.get_mut() += count;
}
}
}
}
impl Frequencies<Vec<u8>> {
#[inline]
pub fn add_borrowed(&mut self, v: &[u8]) {
if let Some(count) = self.data.get_mut(v) {
*count += 1;
} else {
self.data.insert(v.to_vec(), 1);
}
}
#[inline]
pub fn increment_by_borrowed(&mut self, v: &[u8], count: u64) {
if let Some(existing) = self.data.get_mut(v) {
*existing += count;
} else {
self.data.insert(v.to_vec(), count);
}
}
}
impl<T: Eq + Hash> Commute for Frequencies<T> {
#[inline]
fn merge(&mut self, v: Frequencies<T>) {
self.data.reserve(v.data.len());
for (k, v2) in v.data {
match self.data.entry(k) {
Entry::Vacant(v1) => {
v1.insert(v2);
}
Entry::Occupied(mut v1) => {
*v1.get_mut() += v2;
}
}
}
}
}
impl<T: Eq + Hash> Default for Frequencies<T> {
#[inline]
fn default() -> Frequencies<T> {
Frequencies {
data: HashMap::with_capacity(64),
}
}
}
impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
#[inline]
fn from_iter<I: IntoIterator<Item = T>>(it: I) -> Frequencies<T> {
let mut v = Frequencies::new();
v.extend(it);
v
}
}
impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
#[inline]
fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
let iter = it.into_iter();
if let (lower, Some(upper)) = iter.size_hint()
&& lower == upper
{
self.data.reserve(lower.saturating_sub(self.data.len()));
}
for sample in iter {
self.add(sample);
}
}
}
pub struct UniqueValues<'a, K> {
data_keys: Keys<'a, K, u64>,
}
impl<'a, K> Iterator for UniqueValues<'a, K> {
type Item = &'a K;
fn next(&mut self) -> Option<Self::Item> {
self.data_keys.next()
}
}
#[cfg(test)]
mod test {
use super::Frequencies;
use std::iter::FromIterator;
#[test]
fn ranked() {
let mut counts = Frequencies::new();
counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
let (most_count, most_total) = counts.most_frequent();
assert_eq!(most_count[0], (&2, 5));
assert_eq!(most_total, 11);
let (least_count, least_total) = counts.least_frequent();
assert_eq!(least_count[0], (&3, 1));
assert_eq!(least_total, 11);
assert_eq!(
counts.least_frequent(),
(vec![(&3, 1), (&1, 2), (&4, 3), (&2, 5)], 11)
);
}
#[test]
fn ranked2() {
let mut counts = Frequencies::new();
counts.extend(vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]);
let (most_count, most_total) = counts.par_frequent(false);
assert_eq!(most_count[0], (&2, 5));
assert_eq!(most_total, 11);
let (least_count, least_total) = counts.par_frequent(true);
assert_eq!(least_count[0], (&3, 1));
assert_eq!(least_total, 11);
}
#[test]
fn unique_values() {
let freqs = Frequencies::from_iter(vec![8, 6, 5, 1, 1, 2, 2, 2, 3, 4, 7, 4, 4]);
let mut unique: Vec<isize> = freqs.unique_values().copied().collect();
unique.sort_unstable();
assert_eq!(unique, vec![1, 2, 3, 4, 5, 6, 7, 8]);
}
#[test]
fn test_top_n() {
let mut freq = Frequencies::new();
freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
let top_2 = freq.top_n(2);
assert_eq!(top_2.len(), 2);
assert_eq!(top_2[0], (&4, 4)); assert_eq!(top_2[1], (&1, 3));
let bottom_2 = freq.bottom_n(2);
assert_eq!(bottom_2.len(), 2);
assert_eq!(bottom_2[0], (&3, 1)); assert_eq!(bottom_2[1], (&2, 2)); }
#[test]
fn test_count_methods() {
let mut freq = Frequencies::new();
freq.extend(vec![1, 1, 1, 2, 2, 3, 4, 4, 4, 4]);
assert_eq!(freq.total_count(), 10);
assert!(freq.has_count(3)); assert!(freq.has_count(4)); assert!(freq.has_count(1)); assert!(!freq.has_count(5));
let items_with_3 = freq.items_with_count(3);
assert_eq!(items_with_3, vec![&1]);
let items_with_2 = freq.items_with_count(2);
assert_eq!(items_with_2, vec![&2]);
let items_with_1 = freq.items_with_count(1);
assert_eq!(items_with_1, vec![&3]);
let items_with_4 = freq.items_with_count(4);
assert_eq!(items_with_4, vec![&4]);
let items_with_5 = freq.items_with_count(5);
assert!(items_with_5.is_empty()); }
#[test]
fn add_borrowed_inserts_new_key() {
let mut freq = Frequencies::<Vec<u8>>::new();
freq.add_borrowed(b"hello");
assert_eq!(freq.count(&b"hello".to_vec()), 1);
assert_eq!(freq.cardinality(), 1);
}
#[test]
fn add_borrowed_increments_existing_key() {
let mut freq = Frequencies::<Vec<u8>>::new();
freq.add_borrowed(b"hello");
freq.add_borrowed(b"hello");
freq.add_borrowed(b"hello");
assert_eq!(freq.count(&b"hello".to_vec()), 3);
assert_eq!(freq.cardinality(), 1);
freq.increment_by_borrowed(b"world", 5);
assert_eq!(freq.count(&b"world".to_vec()), 5);
freq.increment_by_borrowed(b"world", 3);
assert_eq!(freq.count(&b"world".to_vec()), 8);
}
#[test]
fn borrowed_owned_interop_for_same_key() {
let mut freq = Frequencies::<Vec<u8>>::new();
freq.add(b"key".to_vec());
freq.add_borrowed(b"key");
freq.increment_by_borrowed(b"key", 3);
assert_eq!(freq.count(&b"key".to_vec()), 5);
assert_eq!(freq.cardinality(), 1);
}
}