// disty_cli/stats.rs

/// Pre-computed summary statistics over a dataset.
///
/// The samples are stored in ascending order so that quantile lookups and
/// binary searches can index them directly. The remaining fields are
/// snapshots taken when the value is built by `Stats::new`; because every
/// field is public, they are not re-derived if `data` is modified afterwards.
///
/// `PartialEq` compares fields with ordinary `f64` equality, so two values
/// holding a NaN field (for example an undefined `geo_mean`) never compare
/// equal.
#[derive(Debug, Clone, PartialEq)]
pub struct Stats {
    /// The samples, sorted ascending.
    pub data: Vec<f64>,
    /// Number of samples (`data.len()` at construction time).
    pub n: usize,
    /// Sum of all samples.
    pub sum: f64,
    /// Arithmetic mean, `sum / n`.
    pub mean: f64,
    /// Geometric mean; NaN unless every sample is strictly positive.
    pub geo_mean: f64,
    /// Population variance (squared deviations divided by `n`, not `n - 1`).
    pub variance: f64,
    /// Population standard deviation, the square root of `variance`.
    pub std_dev: f64,
}
12
13impl Stats {
14    pub fn new(mut data: Vec<f64>) -> Self {
15        data.sort_by(|a, b| a.partial_cmp(b).unwrap());
16
17        let n = data.len();
18        let sum: f64 = data.iter().sum();
19        let mean = sum / n as f64;
20
21        let geo_mean = if data.iter().all(|&x| x > 0.0) {
22            let log_sum: f64 = data.iter().map(|x| x.ln()).sum();
23            (log_sum / n as f64).exp()
24        } else {
25            f64::NAN
26        };
27
28        let variance = data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n as f64;
29        let std_dev = variance.sqrt();
30
31        Stats {
32            data,
33            n,
34            sum,
35            mean,
36            geo_mean,
37            variance,
38            std_dev,
39        }
40    }
41
42    /// Calculate quantile (0.0 = min, 0.5 = median, 1.0 = max)
43    pub fn quantile(&self, q: f64) -> f64 {
44        if self.data.is_empty() {
45            return f64::NAN;
46        }
47        if q <= 0.0 {
48            return self.data[0];
49        }
50        if q >= 1.0 {
51            return self.data[self.n - 1];
52        }
53
54        // Linear interpolation between closest ranks
55        let rank = q * (self.n - 1) as f64;
56        let lower = rank.floor() as usize;
57        let upper = rank.ceil() as usize;
58        let fraction = rank - lower as f64;
59
60        self.data[lower] * (1.0 - fraction) + self.data[upper] * fraction
61    }
62}
63
/// Unit tests for `Stats::new` and `Stats::quantile`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_stats_basic() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let stats = Stats::new(data);

        assert_eq!(stats.n, 5);
        assert_eq!(stats.sum, 15.0);
        assert_eq!(stats.mean, 3.0);
    }

    #[test]
    fn test_stats_sorted() {
        let data = vec![5.0, 2.0, 4.0, 1.0, 3.0];
        let stats = Stats::new(data);

        // Construction must leave the samples in ascending order.
        assert_eq!(stats.data, vec![1.0, 2.0, 3.0, 4.0, 5.0]);
    }

    #[test]
    fn test_stats_variance_and_stddev() {
        let data = vec![2.0, 4.0, 6.0, 8.0, 10.0];
        let stats = Stats::new(data);

        // Mean = 6.0
        // Population variance = ((2-6)² + (4-6)² + (6-6)² + (8-6)² + (10-6)²) / 5
        //                     = (16 + 4 + 0 + 4 + 16) / 5 = 40 / 5 = 8.0
        assert_eq!(stats.mean, 6.0);
        assert_eq!(stats.variance, 8.0);
        assert!((stats.std_dev - 8.0_f64.sqrt()).abs() < 1e-10);
    }

    #[test]
    fn test_stats_geometric_mean() {
        let data = vec![1.0, 2.0, 4.0, 8.0];
        let stats = Stats::new(data);

        // Geometric mean = (1 * 2 * 4 * 8)^(1/4) = 64^(1/4) = 2.828...
        let expected_gmean = (1.0 * 2.0 * 4.0 * 8.0_f64).powf(0.25);
        assert!((stats.geo_mean - expected_gmean).abs() < 1e-10);
    }

    #[test]
    fn test_stats_geometric_mean_with_zero() {
        let data = vec![0.0, 1.0, 2.0, 3.0];
        let stats = Stats::new(data);

        // The geometric mean is reported as NaN when any sample is zero.
        assert!(stats.geo_mean.is_nan());
    }

    #[test]
    fn test_stats_geometric_mean_with_negative() {
        let data = vec![-1.0, 1.0, 2.0, 3.0];
        let stats = Stats::new(data);

        // The geometric mean is reported as NaN when any sample is negative.
        assert!(stats.geo_mean.is_nan());
    }

    #[test]
    fn test_quantile_min() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let stats = Stats::new(data);

        assert_eq!(stats.quantile(0.0), 1.0);
    }

    #[test]
    fn test_quantile_max() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let stats = Stats::new(data);

        assert_eq!(stats.quantile(1.0), 5.0);
    }

    #[test]
    fn test_quantile_median() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let stats = Stats::new(data);

        assert_eq!(stats.quantile(0.5), 3.0);
    }

    #[test]
    fn test_quantile_interpolation() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let stats = Stats::new(data);

        // 25th percentile: rank = 0.25 * (5 - 1) = 1.0, which lands exactly
        // on index 1 (value 2.0), so no interpolation takes place.
        let q25 = stats.quantile(0.25);
        assert!((q25 - 2.0).abs() < 1e-10);

        // 75th percentile: rank = 0.75 * (5 - 1) = 3.0, exactly index 3
        // (value 4.0).
        let q75 = stats.quantile(0.75);
        assert!((q75 - 4.0).abs() < 1e-10);

        // 10th percentile: rank = 0.4, between index 0 (value 1.0) and
        // index 1 (value 2.0): 1.0 * 0.6 + 2.0 * 0.4 = 1.4
        let q10 = stats.quantile(0.1);
        assert!((q10 - 1.4).abs() < 1e-10);

        // 90th percentile: rank = 3.6, between index 3 (value 4.0) and
        // index 4 (value 5.0): 4.0 * 0.4 + 5.0 * 0.6 = 4.6
        let q90 = stats.quantile(0.9);
        assert!((q90 - 4.6).abs() < 1e-10);
    }

    #[test]
    fn test_quantile_even_number_of_values() {
        let data = vec![1.0, 2.0, 3.0, 4.0];
        let stats = Stats::new(data);

        // rank = 0.5 * 3 = 1.5, halfway between 2.0 and 3.0.
        let median = stats.quantile(0.5);
        assert_eq!(median, 2.5);
    }

    #[test]
    fn test_quantile_empty_data() {
        let data = vec![];
        let stats = Stats::new(data);

        assert!(stats.quantile(0.5).is_nan());
    }

    #[test]
    fn test_quantile_single_value() {
        let data = vec![42.0];
        let stats = Stats::new(data);

        assert_eq!(stats.quantile(0.0), 42.0);
        assert_eq!(stats.quantile(0.5), 42.0);
        assert_eq!(stats.quantile(1.0), 42.0);
    }

    #[test]
    fn test_quantile_negative_q() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let stats = Stats::new(data);

        // Negative q is clamped to the minimum.
        assert_eq!(stats.quantile(-0.5), 1.0);
    }

    #[test]
    fn test_quantile_q_greater_than_one() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let stats = Stats::new(data);

        // q > 1.0 is clamped to the maximum.
        assert_eq!(stats.quantile(1.5), 5.0);
    }

    #[test]
    fn test_stats_with_duplicates() {
        let data = vec![1.0, 2.0, 2.0, 2.0, 5.0];
        let stats = Stats::new(data);

        assert_eq!(stats.n, 5);
        assert_eq!(stats.sum, 12.0);
        assert_eq!(stats.mean, 2.4);
        assert_eq!(stats.quantile(0.5), 2.0);
    }

    #[test]
    fn test_stats_all_same_values() {
        let data = vec![5.0, 5.0, 5.0, 5.0, 5.0];
        let stats = Stats::new(data);

        assert_eq!(stats.mean, 5.0);
        assert_eq!(stats.variance, 0.0);
        assert_eq!(stats.std_dev, 0.0);
        assert_eq!(stats.quantile(0.0), 5.0);
        assert_eq!(stats.quantile(0.5), 5.0);
        assert_eq!(stats.quantile(1.0), 5.0);
    }

    #[test]
    fn test_stats_large_range() {
        let data = vec![1.0, 1000.0, 1000000.0];
        let stats = Stats::new(data);

        assert_eq!(stats.n, 3);
        assert_eq!(stats.sum, 1001001.0);
        assert!((stats.mean - 333667.0).abs() < 1.0);
    }
}