1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
// -*- coding: utf-8 -*-
// ------------------------------------------------------------------------------------------------
// Copyright © 2023, stack-graphs authors.
// Licensed under either of Apache License, Version 2.0, or MIT license, at your option.
// Please see the LICENSE-APACHE or LICENSE-MIT files in this distribution for license details.
// ------------------------------------------------------------------------------------------------

use std::collections::HashMap;
use std::hash::Hash;

use itertools::Itertools;

/// Frequency distribution maintains the frequency of T values.
#[derive(Clone, Debug, Default)]
pub struct FrequencyDistribution<T>
where
    T: Eq + Hash,
{
    values: HashMap<T, usize>,
    total: usize,
}

impl<T: Eq + Hash> FrequencyDistribution<T> {
    pub fn record(&mut self, value: T) {
        *self.values.entry(value).or_default() += 1;
        self.total += 1;
    }

    // The number of recorded values.
    pub fn count(&self) -> usize {
        return self.total;
    }

    // The number of unique recorded values.
    pub fn unique(&self) -> usize {
        return self.values.len();
    }

    pub fn frequencies(&self) -> FrequencyDistribution<usize> {
        let mut fs = FrequencyDistribution::default();
        for count in self.values.values() {
            fs.record(*count);
        }
        fs
    }
}

impl<T: Eq + Hash + Ord> FrequencyDistribution<T> {
    pub fn quantiles(&self, q: usize) -> Vec<&T> {
        if q == 0 || self.total == 0 {
            return vec![];
        }

        let mut it = self.values.iter().sorted_by_key(|e| e.0);
        let mut total_count = 0;
        let mut last_value;
        let mut result = Vec::new();

        if let Some((value, count)) = it.next() {
            total_count += count;
            last_value = value;
        } else {
            return vec![];
        }
        result.push(last_value);

        for k in 1..=q {
            let limit = ((self.total as f64 * k as f64) / q as f64).round() as usize;
            while total_count < limit {
                if let Some((value, count)) = it.next() {
                    total_count += count;
                    last_value = value;
                } else {
                    break;
                }
            }
            result.push(last_value);
        }

        result
    }
}

impl<T> std::ops::AddAssign<Self> for FrequencyDistribution<T>
where
    T: Eq + Hash,
{
    fn add_assign(&mut self, rhs: Self) {
        for (value, count) in rhs.values {
            *self.values.entry(value).or_default() += count;
        }
        self.total += rhs.total;
    }
}

impl<T> std::ops::AddAssign<&Self> for FrequencyDistribution<T>
where
    T: Eq + Hash + Clone,
{
    fn add_assign(&mut self, rhs: &Self) {
        for (value, count) in &rhs.values {
            *self.values.entry(value.clone()).or_default() += count;
        }
        self.total += rhs.total;
    }
}