// rs_sci/stats.rs — descriptive and inferential statistics utilities.

use crate::consts::Const;
use std::collections::HashMap;
/// Zero-sized namespace type: every statistic is exposed as an associated
/// function (`Stats::mean`, `Stats::median`, …).
#[derive(Debug, Clone)]
pub struct Stats;
7impl Stats {
8    // Central Tendency Measures
9    pub fn mean(data: &[f64]) -> Option<f64> {
10        if data.is_empty() {
11            return None;
12        }
13        Some(data.iter().sum::<f64>() / data.len() as f64)
14    }
15
16    pub fn median(data: &[f64]) -> Option<f64> {
17        if data.is_empty() {
18            return None;
19        }
20        let mut sorted = data.to_vec();
21        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
22        let mid = sorted.len() / 2;
23        if sorted.len() % 2 == 0 {
24            Some((sorted[mid - 1] + sorted[mid]) / 2.0)
25        } else {
26            Some(sorted[mid])
27        }
28    }
29
30    pub fn mode(data: &[f64]) -> Option<Vec<f64>> {
31        if data.is_empty() {
32            return None;
33        }
34        let mut freq_map: HashMap<String, usize> = HashMap::new();
35
36        // Convert to strings to handle floating point comparison
37        for value in data {
38            let key = format!("{:.10}", value);
39            *freq_map.entry(key).or_insert(0) += 1;
40        }
41
42        let max_freq = freq_map.values().max()?;
43        let modes: Vec<f64> = freq_map
44            .iter()
45            .filter(|(_, &count)| count == *max_freq)
46            .map(|(key, _)| key.parse::<f64>().unwrap())
47            .collect();
48
49        Some(modes)
50    }
51
52    // Dispersion Measures
53    pub fn variance(data: &[f64]) -> Option<f64> {
54        if data.len() < 2 {
55            return None;
56        }
57        let mean = Stats::mean(data)?;
58        let squared_diff_sum: f64 = data.iter().map(|&x| (x - mean).powi(2)).sum();
59        Some(squared_diff_sum / (data.len() - 1) as f64)
60    }
61
62    pub fn std_dev(data: &[f64]) -> Option<f64> {
63        Some(Stats::variance(data)?.sqrt())
64    }
65
66    pub fn range(data: &[f64]) -> Option<f64> {
67        if data.is_empty() {
68            return None;
69        }
70        let min = data.iter().min_by(|a, b| a.partial_cmp(b).unwrap())?;
71        let max = data.iter().max_by(|a, b| a.partial_cmp(b).unwrap())?;
72        Some(max - min)
73    }
74
75    // Quartiles and Percentiles
76    pub fn quartiles(data: &[f64]) -> Option<(f64, f64, f64)> {
77        if data.is_empty() {
78            return None;
79        }
80        let mut sorted = data.to_vec();
81        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
82
83        let q2 = Stats::median(&sorted)?;
84        let (lower, upper) = sorted.split_at(sorted.len() / 2);
85        let q1 = Stats::median(lower)?;
86        let q3 = Stats::median(if sorted.len() % 2 == 0 {
87            upper
88        } else {
89            &upper[1..]
90        })?;
91
92        Some((q1, q2, q3))
93    }
94
95    pub fn percentile(data: &[f64], p: f64) -> Option<f64> {
96        if data.is_empty() || p < 0.0 || p > 100.0 {
97            return None;
98        }
99        let mut sorted = data.to_vec();
100        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
101
102        let rank = p / 100.0 * (sorted.len() - 1) as f64;
103        let lower_idx = rank.floor() as usize;
104        let upper_idx = rank.ceil() as usize;
105
106        if lower_idx == upper_idx {
107            Some(sorted[lower_idx])
108        } else {
109            let weight = rank - lower_idx as f64;
110            Some(sorted[lower_idx] * (1.0 - weight) + sorted[upper_idx] * weight)
111        }
112    }
113
114    // Distribution Characteristics
115    pub fn skewness(data: &[f64]) -> Option<f64> {
116        if data.len() < 3 {
117            return None;
118        }
119        let mean = Stats::mean(data)?;
120        let std_dev = Stats::std_dev(data)?;
121        let n = data.len() as f64;
122
123        let m3 = data.iter().map(|&x| (x - mean).powi(3)).sum::<f64>() / n;
124
125        Some(m3 / std_dev.powi(3))
126    }
127
128    pub fn kurtosis(data: &[f64]) -> Option<f64> {
129        if data.len() < 4 {
130            return None;
131        }
132        let mean = Stats::mean(data)?;
133        let std_dev = Stats::std_dev(data)?;
134        let n = data.len() as f64;
135
136        let m4 = data.iter().map(|&x| (x - mean).powi(4)).sum::<f64>() / n;
137
138        Some(m4 / std_dev.powi(4) - 3.0) // Excess kurtosis
139    }
140
141    // Correlation and Covariance
142    pub fn covariance(x: &[f64], y: &[f64]) -> Option<f64> {
143        if x.len() != y.len() || x.is_empty() {
144            return None;
145        }
146        let mean_x = Stats::mean(x)?;
147        let mean_y = Stats::mean(y)?;
148        let n = x.len() as f64;
149
150        let sum = x
151            .iter()
152            .zip(y.iter())
153            .map(|(&xi, &yi)| (xi - mean_x) * (yi - mean_y))
154            .sum::<f64>();
155
156        Some(sum / (n - 1.0))
157    }
158
159    pub fn correlation(x: &[f64], y: &[f64]) -> Option<f64> {
160        let cov = Stats::covariance(x, y)?;
161        let std_x = Stats::std_dev(x)?;
162        let std_y = Stats::std_dev(y)?;
163
164        Some(cov / (std_x * std_y))
165    }
166
167    // Z-scores and Standardization
168    pub fn z_scores(data: &[f64]) -> Option<Vec<f64>> {
169        let mean = Stats::mean(data)?;
170        let std_dev = Stats::std_dev(data)?;
171
172        Some(data.iter().map(|&x| (x - mean) / std_dev).collect())
173    }
174
175    // Robust Statistics
176    pub fn mad(data: &[f64]) -> Option<f64> {
177        // Median Absolute Deviation
178        let median = Stats::median(data)?;
179        let deviations: Vec<f64> = data.iter().map(|&x| (x - median).abs()).collect();
180        Stats::median(&deviations)
181    }
182
183    pub fn winsorized_mean(data: &[f64], trim: f64) -> Option<f64> {
184        if data.is_empty() || trim < 0.0 || trim > 0.5 {
185            return None;
186        }
187        let mut sorted = data.to_vec();
188        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
189
190        let n = sorted.len();
191        let k = (n as f64 * trim).floor() as usize;
192
193        let lower = sorted[k];
194        let upper = sorted[n - k - 1];
195
196        let sum: f64 = sorted.iter().map(|&x| x.max(lower).min(upper)).sum();
197
198        Some(sum / n as f64)
199    }
200
201    pub fn moving_average(data: &[f64], window: usize) -> Option<Vec<f64>> {
202        if window == 0 || window > data.len() {
203            return None;
204        }
205
206        let mut result = Vec::with_capacity(data.len() - window + 1);
207        for i in 0..=data.len() - window {
208            let mean = Stats::mean(&data[i..i + window])?;
209            result.push(mean);
210        }
211        Some(result)
212    }
213
214    pub fn exponential_smoothing(data: &[f64], alpha: f64) -> Option<Vec<f64>> {
215        if data.is_empty() || alpha < 0.0 || alpha > 1.0 {
216            return None;
217        }
218
219        let mut result = Vec::with_capacity(data.len());
220        result.push(data[0]);
221
222        for i in 1..data.len() {
223            let smoothed = alpha * data[i] + (1.0 - alpha) * result[i - 1];
224            result.push(smoothed);
225        }
226        Some(result)
227    }
228
229    // Multiple Testing Corrections
230    pub fn bonferroni_correction(p_values: &[f64]) -> Option<Vec<f64>> {
231        if p_values.is_empty() {
232            return None;
233        }
234        let n = p_values.len() as f64;
235        Some(p_values.iter().map(|&p| (p * n).min(1.0)).collect())
236    }
237
238    pub fn benjamini_hochberg(p_values: &[f64]) -> Option<Vec<f64>> {
239        if p_values.is_empty() {
240            return None;
241        }
242
243        // Create indexed p-values
244        let mut indexed_p: Vec<(usize, &f64)> = p_values.iter().enumerate().collect();
245        indexed_p.sort_by(|a, b| a.1.partial_cmp(b.1).unwrap());
246
247        let n = p_values.len();
248        let mut adjusted = vec![0.0; n];
249
250        // Calculate adjusted p-values
251        let mut prev = 1.0;
252        for (rank, (orig_idx, &p)) in indexed_p.iter().enumerate().rev() {
253            let adj_p = (p * n as f64 / (rank + 1) as f64).min(prev);
254            adjusted[*orig_idx] = adj_p;
255            prev = adj_p;
256        }
257
258        Some(adjusted)
259    }
260
261    // Kernel Density Estimation
262    pub fn kernel_density_estimation(data: &[f64], x: f64, bandwidth: f64) -> Option<f64> {
263        if data.is_empty() || bandwidth <= 0.0 {
264            return None;
265        }
266
267        let n = data.len() as f64;
268        let density: f64 = data
269            .iter()
270            .map(|&xi| {
271                let u = (x - xi) / bandwidth;
272                Stats::gaussian_kernel(u) / bandwidth
273            })
274            .sum::<f64>()
275            / n;
276
277        Some(density)
278    }
279
280    fn gaussian_kernel(u: f64) -> f64 {
281        (-0.5 * u * u).exp() / (2.0 * Const::PI).sqrt()
282    }
283
284    // Bayesian Statistics
285    pub fn bayesian_update(prior: f64, likelihood: f64, evidence: f64) -> Option<f64> {
286        if evidence == 0.0 || prior < 0.0 || prior > 1.0 || likelihood < 0.0 {
287            return None;
288        }
289        Some(prior * likelihood / evidence)
290    }
291
292    // Information Theory Metrics
293    pub fn entropy(probabilities: &[f64]) -> Option<f64> {
294        if probabilities.is_empty()
295            || probabilities.iter().any(|&p| p < 0.0 || p > 1.0)
296            || (probabilities.iter().sum::<f64>() - 1.0).abs() > 1e-10
297        {
298            return None;
299        }
300
301        Some(
302            -probabilities
303                .iter()
304                .filter(|&&p| p > 0.0)
305                .map(|&p| p * p.log2())
306                .sum::<f64>(),
307        )
308    }
309
310    pub fn kullback_leibler_divergence(p: &[f64], q: &[f64]) -> Option<f64> {
311        if p.len() != q.len() || p.is_empty() {
312            return None;
313        }
314
315        Some(
316            p.iter()
317                .zip(q.iter())
318                .filter(|(&pi, &qi)| pi > 0.0 && qi > 0.0)
319                .map(|(&pi, &qi)| pi * (pi / qi).log2())
320                .sum(),
321        )
322    }
323}
325// Summary Statistics
326pub fn summary_statistics(data: &[f64]) -> Option<SummaryStats> {
327    Some(SummaryStats {
328        count: data.len(),
329        mean: Stats::mean(data)?,
330        median: Stats::median(data)?,
331        mode: Stats::mode(data)?,
332        variance: Stats::variance(data)?,
333        std_dev: Stats::std_dev(data)?,
334        skewness: Stats::skewness(data)?,
335        kurtosis: Stats::kurtosis(data)?,
336        range: Stats::range(data)?,
337        quartiles: Stats::quartiles(data)?,
338    })
339}
/// Bundle of descriptive statistics produced by [`summary_statistics`].
#[derive(Debug)]
pub struct SummaryStats {
    /// Number of observations.
    pub count: usize,
    /// Arithmetic mean.
    pub mean: f64,
    /// Middle value (mean of the two central values for even counts).
    pub median: f64,
    /// All values with maximal frequency (order unspecified).
    pub mode: Vec<f64>,
    /// Sample variance (n − 1 denominator).
    pub variance: f64,
    /// Sample standard deviation.
    pub std_dev: f64,
    /// Third standardized moment.
    pub skewness: f64,
    /// Excess kurtosis (0 for a normal distribution).
    pub kurtosis: f64,
    /// max − min.
    pub range: f64,
    /// (Q1, Q2, Q3).
    pub quartiles: (f64, f64, f64),
}

// Probability Distributions
356impl Stats {
357    pub fn normal_pdf(x: f64, mean: f64, std_dev: f64) -> f64 {
358        let exponent = -(x - mean).powi(2) / (2.0 * std_dev.powi(2));
359        (1.0 / (std_dev * (2.0 * Const::PI).sqrt())) * exponent.exp()
360    }
361
362    // Effect Size Measures
363    pub fn cohens_d(data1: &[f64], data2: &[f64]) -> Option<f64> {
364        let mean1 = Stats::mean(data1)?;
365        let mean2 = Stats::mean(data2)?;
366        let var1 = Stats::variance(data1)?;
367        let var2 = Stats::variance(data2)?;
368        let pooled_std = ((var1 + var2) / 2.0).sqrt();
369        Some((mean1 - mean2) / pooled_std)
370    }
371
372    // Regression Analysis
373    pub fn linear_regression(x: &[f64], y: &[f64]) -> Option<(f64, f64)> {
374        // Returns (slope, intercept)
375        if x.len() != y.len() || x.is_empty() {
376            return None;
377        }
378
379        let mean_x = Stats::mean(x)?;
380        let mean_y = Stats::mean(y)?;
381        let cov = Stats::covariance(x, y)?;
382        let var_x = Stats::variance(x)?;
383
384        let slope = cov / var_x;
385        let intercept = mean_y - slope * mean_x;
386
387        Some((slope, intercept))
388    }
389
390    // Time Series Analysis
391    pub fn autocorrelation(data: &[f64], lag: usize) -> Option<f64> {
392        if lag >= data.len() {
393            return None;
394        }
395
396        let mean = Stats::mean(data)?;
397        let n = data.len() - lag;
398
399        let numerator: f64 = (0..n)
400            .map(|i| (data[i] - mean) * (data[i + lag] - mean))
401            .sum();
402        let denominator: f64 = data.iter().map(|&x| (x - mean).powi(2)).sum();
403
404        Some(numerator / denominator)
405    }
406
407    pub fn binomial_pmf(k: u64, n: u64, p: f64) -> f64 {
408        if p < 0.0 || p > 1.0 {
409            return 0.0;
410        }
411        let combinations = Stats::combinations(n, k);
412        combinations as f64 * p.powi(k as i32) * (1.0 - p).powi((n - k) as i32)
413    }
414
415    // Helper function for combinations
416    fn combinations(n: u64, k: u64) -> u64 {
417        if k > n {
418            return 0;
419        }
420        let k = k.min(n - k);
421        let mut result = 1;
422        for i in 0..k {
423            result = result * (n - i) / (i + 1);
424        }
425        result
426    }
427}