1use ndarray::{Array1, Array2};
4use so_core::error::{Error, Result};
5
6pub fn mean(data: &Array1<f64>) -> Option<f64> {
8 if data.is_empty() {
9 return None;
10 }
11 Some(data.sum() / data.len() as f64)
12}
13
14pub fn variance(data: &Array1<f64>, ddof: f64) -> Option<f64> {
16 let n = data.len() as f64;
17 if n <= ddof {
18 return None;
19 }
20
21 let m = mean(data)?;
22 let sum_sq_diff: f64 = data.iter().map(|&x| (x - m).powi(2)).sum();
23 Some(sum_sq_diff / (n - ddof))
24}
25
26pub fn std(data: &Array1<f64>, ddof: f64) -> Option<f64> {
28 variance(data, ddof).map(|v| v.sqrt())
29}
30
31pub fn median(data: &Array1<f64>) -> Option<f64> {
33 if data.is_empty() {
34 return None;
35 }
36
37 let mut sorted: Vec<f64> = data.to_vec();
38 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
39
40 let n = sorted.len();
41 if n % 2 == 1 {
42 Some(sorted[n / 2])
43 } else {
44 Some((sorted[n / 2 - 1] + sorted[n / 2]) / 2.0)
45 }
46}
47
48pub fn quantile(data: &Array1<f64>, q: f64) -> Option<f64> {
50 if data.is_empty() || !(0.0..=1.0).contains(&q) {
51 return None;
52 }
53
54 let mut sorted: Vec<f64> = data.to_vec();
55 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
56
57 let n = sorted.len() as f64;
58 let index = q * (n - 1.0);
59 let lower = index.floor() as usize;
60 let upper = index.ceil() as usize;
61
62 if lower == upper {
63 Some(sorted[lower])
64 } else {
65 let weight = index - lower as f64;
66 Some(sorted[lower] * (1.0 - weight) + sorted[upper] * weight)
67 }
68}
69
70pub fn iqr(data: &Array1<f64>) -> Option<f64> {
72 let q1 = quantile(data, 0.25)?;
73 let q3 = quantile(data, 0.75)?;
74 Some(q3 - q1)
75}
76
77pub fn skewness(data: &Array1<f64>) -> Option<f64> {
79 let n = data.len() as f64;
80 if n < 3.0 {
81 return None;
82 }
83
84 let m = mean(data)?;
85 let s = std(data, 1.0)?; if s == 0.0 {
88 return Some(0.0);
89 }
90
91 let sum_cubes: f64 = data.iter().map(|&x| ((x - m) / s).powi(3)).sum();
92 Some(sum_cubes / n)
93}
94
95pub fn kurtosis(data: &Array1<f64>) -> Option<f64> {
97 let n = data.len() as f64;
98 if n < 4.0 {
99 return None;
100 }
101
102 let m = mean(data)?;
103 let s = std(data, 1.0)?; if s == 0.0 {
106 return Some(0.0);
107 }
108
109 let sum_quarts: f64 = data.iter().map(|&x| ((x - m) / s).powi(4)).sum();
110 Some(sum_quarts / n - 3.0)
111}
112
113pub fn covariance(x: &Array1<f64>, y: &Array1<f64>, ddof: f64) -> Option<f64> {
115 if x.len() != y.len() || x.is_empty() {
116 return None;
117 }
118
119 let n = x.len() as f64;
120 if n <= ddof {
121 return None;
122 }
123
124 let x_mean = mean(x)?;
125 let y_mean = mean(y)?;
126
127 let mut sum = 0.0;
128 for i in 0..x.len() {
129 sum += (x[i] - x_mean) * (y[i] - y_mean);
130 }
131
132 Some(sum / (n - ddof))
133}
134
135pub fn correlation(x: &Array1<f64>, y: &Array1<f64>) -> Option<f64> {
137 let cov = covariance(x, y, 1.0)?;
138 let x_std = std(x, 1.0)?;
139 let y_std = std(y, 1.0)?;
140
141 if x_std == 0.0 || y_std == 0.0 {
142 return Some(0.0);
143 }
144
145 Some(cov / (x_std * y_std))
146}
147
148#[derive(Debug, Clone)]
150pub struct SummaryStats {
151 pub count: usize,
152 pub mean: f64,
153 pub std: f64,
154 pub min: f64,
155 pub q25: f64,
156 pub median: f64,
157 pub q75: f64,
158 pub max: f64,
159 pub skewness: f64,
160 pub kurtosis: f64,
161}
162
163impl SummaryStats {
164 pub fn from_data(data: &Array1<f64>) -> Option<Self> {
166 if data.is_empty() {
167 return None;
168 }
169
170 let count = data.len();
171 let mean = mean(data)?;
172 let std = std(data, 1.0)?;
173 let min = data.iter().fold(f64::INFINITY, |a, &b| a.min(b));
174 let max = data.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
175 let q25 = quantile(data, 0.25)?;
176 let median = median(data)?;
177 let q75 = quantile(data, 0.75)?;
178 let skewness = skewness(data).unwrap_or(0.0);
179 let kurtosis = kurtosis(data).unwrap_or(0.0);
180
181 Some(Self {
182 count,
183 mean,
184 std,
185 min,
186 q25,
187 median,
188 q75,
189 max,
190 skewness,
191 kurtosis,
192 })
193 }
194}
195
196pub fn correlation_matrix(data: &Array2<f64>) -> Result<Array2<f64>> {
198 let (n_samples, n_features) = data.dim();
199
200 if n_samples < 2 {
201 return Err(Error::DataError(
202 "Need at least 2 samples to compute correlation".to_string(),
203 ));
204 }
205
206 let mut corr = Array2::zeros((n_features, n_features));
207
208 for i in 0..n_features {
209 for j in 0..n_features {
210 let x = data.column(i);
211 let y = data.column(j);
212
213 if let Some(c) = correlation(&x.to_owned(), &y.to_owned()) {
214 corr[(i, j)] = c;
215 } else {
216 corr[(i, j)] = 0.0;
217 }
218 }
219 }
220
221 Ok(corr)
222}
223
224pub fn covariance_matrix(data: &Array2<f64>, ddof: f64) -> Result<Array2<f64>> {
226 let (n_samples, n_features) = data.dim();
227
228 if n_samples as f64 <= ddof {
229 return Err(Error::DataError(format!(
230 "Not enough samples for covariance with ddof={}",
231 ddof
232 )));
233 }
234
235 let mut cov = Array2::zeros((n_features, n_features));
236 let means: Vec<f64> = (0..n_features)
237 .map(|i| data.column(i).mean().unwrap_or(0.0))
238 .collect();
239
240 for i in 0..n_features {
241 for j in 0..=i {
242 let mut sum = 0.0;
243 for k in 0..n_samples {
244 sum += (data[(k, i)] - means[i]) * (data[(k, j)] - means[j]);
245 }
246 let value = sum / (n_samples as f64 - ddof);
247 cov[(i, j)] = value;
248 if i != j {
249 cov[(j, i)] = value;
250 }
251 }
252 }
253
254 Ok(cov)
255}