Skip to main content

so_stats/
descriptive.rs

1//! Descriptive statistics functions
2
3use ndarray::{Array1, Array2};
4use so_core::error::{Error, Result};
5
6/// Compute mean of an array
7pub fn mean(data: &Array1<f64>) -> Option<f64> {
8    if data.is_empty() {
9        return None;
10    }
11    Some(data.sum() / data.len() as f64)
12}
13
14/// Compute variance with given degrees of freedom adjustment
15pub fn variance(data: &Array1<f64>, ddof: f64) -> Option<f64> {
16    let n = data.len() as f64;
17    if n <= ddof {
18        return None;
19    }
20
21    let m = mean(data)?;
22    let sum_sq_diff: f64 = data.iter().map(|&x| (x - m).powi(2)).sum();
23    Some(sum_sq_diff / (n - ddof))
24}
25
26/// Compute standard deviation
27pub fn std(data: &Array1<f64>, ddof: f64) -> Option<f64> {
28    variance(data, ddof).map(|v| v.sqrt())
29}
30
31/// Compute median
32pub fn median(data: &Array1<f64>) -> Option<f64> {
33    if data.is_empty() {
34        return None;
35    }
36
37    let mut sorted: Vec<f64> = data.to_vec();
38    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
39
40    let n = sorted.len();
41    if n % 2 == 1 {
42        Some(sorted[n / 2])
43    } else {
44        Some((sorted[n / 2 - 1] + sorted[n / 2]) / 2.0)
45    }
46}
47
48/// Compute quantile using linear interpolation
49pub fn quantile(data: &Array1<f64>, q: f64) -> Option<f64> {
50    if data.is_empty() || !(0.0..=1.0).contains(&q) {
51        return None;
52    }
53
54    let mut sorted: Vec<f64> = data.to_vec();
55    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
56
57    let n = sorted.len() as f64;
58    let index = q * (n - 1.0);
59    let lower = index.floor() as usize;
60    let upper = index.ceil() as usize;
61
62    if lower == upper {
63        Some(sorted[lower])
64    } else {
65        let weight = index - lower as f64;
66        Some(sorted[lower] * (1.0 - weight) + sorted[upper] * weight)
67    }
68}
69
70/// Compute interquartile range (IQR)
71pub fn iqr(data: &Array1<f64>) -> Option<f64> {
72    let q1 = quantile(data, 0.25)?;
73    let q3 = quantile(data, 0.75)?;
74    Some(q3 - q1)
75}
76
77/// Compute skewness (Fisher-Pearson coefficient)
78pub fn skewness(data: &Array1<f64>) -> Option<f64> {
79    let n = data.len() as f64;
80    if n < 3.0 {
81        return None;
82    }
83
84    let m = mean(data)?;
85    let s = std(data, 1.0)?; // sample std
86
87    if s == 0.0 {
88        return Some(0.0);
89    }
90
91    let sum_cubes: f64 = data.iter().map(|&x| ((x - m) / s).powi(3)).sum();
92    Some(sum_cubes / n)
93}
94
95/// Compute kurtosis (Fisher's definition, excess kurtosis)
96pub fn kurtosis(data: &Array1<f64>) -> Option<f64> {
97    let n = data.len() as f64;
98    if n < 4.0 {
99        return None;
100    }
101
102    let m = mean(data)?;
103    let s = std(data, 1.0)?; // sample std
104
105    if s == 0.0 {
106        return Some(0.0);
107    }
108
109    let sum_quarts: f64 = data.iter().map(|&x| ((x - m) / s).powi(4)).sum();
110    Some(sum_quarts / n - 3.0)
111}
112
113/// Compute covariance between two arrays
114pub fn covariance(x: &Array1<f64>, y: &Array1<f64>, ddof: f64) -> Option<f64> {
115    if x.len() != y.len() || x.is_empty() {
116        return None;
117    }
118
119    let n = x.len() as f64;
120    if n <= ddof {
121        return None;
122    }
123
124    let x_mean = mean(x)?;
125    let y_mean = mean(y)?;
126
127    let mut sum = 0.0;
128    for i in 0..x.len() {
129        sum += (x[i] - x_mean) * (y[i] - y_mean);
130    }
131
132    Some(sum / (n - ddof))
133}
134
135/// Compute correlation coefficient (Pearson)
136pub fn correlation(x: &Array1<f64>, y: &Array1<f64>) -> Option<f64> {
137    let cov = covariance(x, y, 1.0)?;
138    let x_std = std(x, 1.0)?;
139    let y_std = std(y, 1.0)?;
140
141    if x_std == 0.0 || y_std == 0.0 {
142        return Some(0.0);
143    }
144
145    Some(cov / (x_std * y_std))
146}
147
148/// Compute summary statistics for an array
149#[derive(Debug, Clone)]
150pub struct SummaryStats {
151    pub count: usize,
152    pub mean: f64,
153    pub std: f64,
154    pub min: f64,
155    pub q25: f64,
156    pub median: f64,
157    pub q75: f64,
158    pub max: f64,
159    pub skewness: f64,
160    pub kurtosis: f64,
161}
162
163impl SummaryStats {
164    /// Compute summary statistics from data
165    pub fn from_data(data: &Array1<f64>) -> Option<Self> {
166        if data.is_empty() {
167            return None;
168        }
169
170        let count = data.len();
171        let mean = mean(data)?;
172        let std = std(data, 1.0)?;
173        let min = data.iter().fold(f64::INFINITY, |a, &b| a.min(b));
174        let max = data.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
175        let q25 = quantile(data, 0.25)?;
176        let median = median(data)?;
177        let q75 = quantile(data, 0.75)?;
178        let skewness = skewness(data).unwrap_or(0.0);
179        let kurtosis = kurtosis(data).unwrap_or(0.0);
180
181        Some(Self {
182            count,
183            mean,
184            std,
185            min,
186            q25,
187            median,
188            q75,
189            max,
190            skewness,
191            kurtosis,
192        })
193    }
194}
195
196/// Compute correlation matrix
197pub fn correlation_matrix(data: &Array2<f64>) -> Result<Array2<f64>> {
198    let (n_samples, n_features) = data.dim();
199
200    if n_samples < 2 {
201        return Err(Error::DataError(
202            "Need at least 2 samples to compute correlation".to_string(),
203        ));
204    }
205
206    let mut corr = Array2::zeros((n_features, n_features));
207
208    for i in 0..n_features {
209        for j in 0..n_features {
210            let x = data.column(i);
211            let y = data.column(j);
212
213            if let Some(c) = correlation(&x.to_owned(), &y.to_owned()) {
214                corr[(i, j)] = c;
215            } else {
216                corr[(i, j)] = 0.0;
217            }
218        }
219    }
220
221    Ok(corr)
222}
223
224/// Compute covariance matrix
225pub fn covariance_matrix(data: &Array2<f64>, ddof: f64) -> Result<Array2<f64>> {
226    let (n_samples, n_features) = data.dim();
227
228    if n_samples as f64 <= ddof {
229        return Err(Error::DataError(format!(
230            "Not enough samples for covariance with ddof={}",
231            ddof
232        )));
233    }
234
235    let mut cov = Array2::zeros((n_features, n_features));
236    let means: Vec<f64> = (0..n_features)
237        .map(|i| data.column(i).mean().unwrap_or(0.0))
238        .collect();
239
240    for i in 0..n_features {
241        for j in 0..=i {
242            let mut sum = 0.0;
243            for k in 0..n_samples {
244                sum += (data[(k, i)] - means[i]) * (data[(k, j)] - means[j]);
245            }
246            let value = sum / (n_samples as f64 - ddof);
247            cov[(i, j)] = value;
248            if i != j {
249                cov[(j, i)] = value;
250            }
251        }
252    }
253
254    Ok(cov)
255}