sklears_utils/array_utils/
stats.rs

1//! Statistical functions for arrays
2
3use crate::{UtilsError, UtilsResult};
4use scirs2_core::ndarray::{Array1, ArrayView1};
5use scirs2_core::numeric::{Float, ToPrimitive};
6
7/// Statistics summary for an array
8#[derive(Debug, Clone)]
9pub struct ArrayStatistics<T> {
10    pub mean: T,
11    pub std: T,
12    pub min: T,
13    pub max: T,
14    pub median: T,
15    pub count: usize,
16}
17
18/// Calculate array sum
19pub fn array_sum<T>(array: &Array1<T>) -> UtilsResult<T>
20where
21    T: Float + Clone,
22{
23    if array.is_empty() {
24        return Err(UtilsError::EmptyInput);
25    }
26
27    let sum = array.iter().fold(T::zero(), |acc, &x| acc + x);
28    Ok(sum)
29}
30
31/// Calculate array mean
32pub fn array_mean<T>(array: &Array1<T>) -> UtilsResult<T>
33where
34    T: Float + Clone + ToPrimitive,
35{
36    if array.is_empty() {
37        return Err(UtilsError::EmptyInput);
38    }
39
40    let sum = array.iter().fold(T::zero(), |acc, &x| acc + x);
41    let count = T::from(array.len()).unwrap();
42    Ok(sum / count)
43}
44
45/// Specialized f64 mean with better numerical stability
46pub fn array_mean_f64(array: &ArrayView1<f64>) -> UtilsResult<f64> {
47    if array.is_empty() {
48        return Err(UtilsError::EmptyInput);
49    }
50
51    // Use Kahan summation for better numerical stability
52    let mut sum = 0.0;
53    let mut c = 0.0; // Compensation for lost low-order bits
54
55    for &value in array {
56        let y = value - c;
57        let t = sum + y;
58        c = (t - sum) - y;
59        sum = t;
60    }
61
62    Ok(sum / array.len() as f64)
63}
64
65/// Calculate array variance
66pub fn array_var<T>(array: &Array1<T>) -> UtilsResult<T>
67where
68    T: Float + Clone + ToPrimitive,
69{
70    if array.len() <= 1 {
71        return Err(UtilsError::InsufficientData {
72            min: 2,
73            actual: array.len(),
74        });
75    }
76
77    let mean = array_mean(array)?;
78    let variance = array
79        .iter()
80        .map(|&x| {
81            let diff = x - mean;
82            diff * diff
83        })
84        .fold(T::zero(), |acc, x| acc + x);
85
86    let n_minus_1 = T::from(array.len() - 1).unwrap();
87    Ok(variance / n_minus_1)
88}
89
90/// Specialized f64 variance with better numerical stability
91pub fn array_variance_f64(array: &ArrayView1<f64>) -> UtilsResult<f64> {
92    if array.len() <= 1 {
93        return Err(UtilsError::InsufficientData {
94            min: 2,
95            actual: array.len(),
96        });
97    }
98
99    let mean = array_mean_f64(array)?;
100
101    // Two-pass algorithm for better numerical stability
102    let mut sum_sq_diff = 0.0;
103    let mut correction = 0.0; // Compensation for lost low-order bits
104
105    for &value in array {
106        let diff = value - mean;
107        let sq_diff = diff * diff;
108        let y = sq_diff - correction;
109        let t = sum_sq_diff + y;
110        correction = (t - sum_sq_diff) - y;
111        sum_sq_diff = t;
112    }
113
114    Ok(sum_sq_diff / (array.len() - 1) as f64)
115}
116
117/// Calculate array standard deviation
118pub fn array_std<T>(array: &Array1<T>) -> UtilsResult<T>
119where
120    T: Float + Clone + ToPrimitive,
121{
122    let variance = array_var(array)?;
123    Ok(variance.sqrt())
124}
125
126/// Find minimum value
127pub fn array_min<T>(array: &Array1<T>) -> UtilsResult<T>
128where
129    T: PartialOrd + Clone,
130{
131    if array.is_empty() {
132        return Err(UtilsError::EmptyInput);
133    }
134
135    let mut min_val = &array[0];
136    for val in array.iter().skip(1) {
137        if val < min_val {
138            min_val = val;
139        }
140    }
141    Ok(min_val.clone())
142}
143
144/// Find maximum value
145pub fn array_max<T>(array: &Array1<T>) -> UtilsResult<T>
146where
147    T: PartialOrd + Clone,
148{
149    if array.is_empty() {
150        return Err(UtilsError::EmptyInput);
151    }
152
153    let mut max_val = &array[0];
154    for val in array.iter().skip(1) {
155        if val > max_val {
156            max_val = val;
157        }
158    }
159    Ok(max_val.clone())
160}
161
162/// Find both min and max values efficiently
163pub fn array_min_max<T>(array: &Array1<T>) -> UtilsResult<(T, T)>
164where
165    T: PartialOrd + Clone,
166{
167    if array.is_empty() {
168        return Err(UtilsError::EmptyInput);
169    }
170
171    let mut min_val = &array[0];
172    let mut max_val = &array[0];
173
174    for val in array.iter().skip(1) {
175        if val < min_val {
176            min_val = val;
177        }
178        if val > max_val {
179            max_val = val;
180        }
181    }
182
183    Ok((min_val.clone(), max_val.clone()))
184}
185
186/// Calculate median
187pub fn array_median<T>(array: &Array1<T>) -> UtilsResult<T>
188where
189    T: PartialOrd + Clone + Float,
190{
191    if array.is_empty() {
192        return Err(UtilsError::EmptyInput);
193    }
194
195    let mut sorted: Vec<T> = array.iter().cloned().collect();
196    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
197
198    let len = sorted.len();
199    if len % 2 == 1 {
200        Ok(sorted[len / 2])
201    } else {
202        let mid = len / 2;
203        let sum = sorted[mid - 1] + sorted[mid];
204        Ok(sum / T::from(2.0).unwrap())
205    }
206}
207
208/// Calculate percentile
209pub fn array_percentile<T>(array: &Array1<T>, percentile: f64) -> UtilsResult<T>
210where
211    T: PartialOrd + Clone + Float,
212{
213    if array.is_empty() {
214        return Err(UtilsError::EmptyInput);
215    }
216
217    if !(0.0..=100.0).contains(&percentile) {
218        return Err(UtilsError::InvalidParameter(
219            "Percentile must be between 0 and 100".to_string(),
220        ));
221    }
222
223    let mut sorted: Vec<T> = array.iter().cloned().collect();
224    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
225
226    let len = sorted.len() as f64;
227    let index = (percentile / 100.0) * (len - 1.0);
228    let lower_index = index.floor() as usize;
229    let upper_index = index.ceil() as usize;
230
231    if lower_index == upper_index {
232        Ok(sorted[lower_index])
233    } else {
234        let weight = T::from(index - lower_index as f64).unwrap();
235        let lower_val = sorted[lower_index];
236        let upper_val = sorted[upper_index];
237        Ok(lower_val + weight * (upper_val - lower_val))
238    }
239}
240
241/// Calculate multiple quantiles efficiently
242pub fn array_quantiles<T>(array: &Array1<T>, quantiles: &[f64]) -> UtilsResult<Vec<T>>
243where
244    T: PartialOrd + Clone + Float,
245{
246    if array.is_empty() {
247        return Err(UtilsError::EmptyInput);
248    }
249
250    let mut sorted: Vec<T> = array.iter().cloned().collect();
251    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
252
253    let mut results = Vec::with_capacity(quantiles.len());
254    let len = sorted.len() as f64;
255
256    for &q in quantiles {
257        if !(0.0..=1.0).contains(&q) {
258            return Err(UtilsError::InvalidParameter(
259                "Quantiles must be between 0 and 1".to_string(),
260            ));
261        }
262
263        let index = q * (len - 1.0);
264        let lower_index = index.floor() as usize;
265        let upper_index = index.ceil() as usize;
266
267        let result = if lower_index == upper_index {
268            sorted[lower_index]
269        } else {
270            let weight = T::from(index - lower_index as f64).unwrap();
271            let lower_val = sorted[lower_index];
272            let upper_val = sorted[upper_index];
273            lower_val + weight * (upper_val - lower_val)
274        };
275
276        results.push(result);
277    }
278
279    Ok(results)
280}
281
282/// Cumulative sum
283pub fn array_cumsum<T>(array: &Array1<T>) -> UtilsResult<Array1<T>>
284where
285    T: Float + Clone,
286{
287    if array.is_empty() {
288        return Err(UtilsError::EmptyInput);
289    }
290
291    let mut result = Vec::with_capacity(array.len());
292    let mut cumulative = T::zero();
293
294    for &value in array {
295        cumulative = cumulative + value;
296        result.push(cumulative);
297    }
298
299    Ok(Array1::from_vec(result))
300}
301
302/// Standardize array to zero mean and unit variance
303pub fn array_standardize<T>(array: &Array1<T>) -> UtilsResult<Array1<T>>
304where
305    T: Float + Clone + ToPrimitive,
306{
307    if array.len() <= 1 {
308        return Err(UtilsError::InsufficientData {
309            min: 2,
310            actual: array.len(),
311        });
312    }
313
314    let mean = array_mean(array)?;
315    let std = array_std(array)?;
316
317    if std.abs() < T::from(1e-10).unwrap() {
318        return Err(UtilsError::InvalidParameter(
319            "Standard deviation too small for standardization".to_string(),
320        ));
321    }
322
323    let standardized = array.mapv(|x| (x - mean) / std);
324    Ok(standardized)
325}
326
327/// Normalize to [0, 1] range
328pub fn array_min_max_normalize<T>(array: &Array1<T>) -> UtilsResult<Array1<T>>
329where
330    T: PartialOrd + Float + Clone,
331{
332    if array.is_empty() {
333        return Err(UtilsError::EmptyInput);
334    }
335
336    let (min_val, max_val) = array_min_max(array)?;
337    let range = max_val - min_val;
338
339    if range.abs() < T::from(1e-10).unwrap() {
340        // All values are the same, return zeros
341        return Ok(Array1::zeros(array.len()));
342    }
343
344    let normalized = array.mapv(|x| (x - min_val) / range);
345    Ok(normalized)
346}
347
348/// Comprehensive array statistics
349pub fn array_describe<T>(array: &Array1<T>) -> UtilsResult<ArrayStatistics<T>>
350where
351    T: Float + Clone + ToPrimitive + PartialOrd,
352{
353    if array.is_empty() {
354        return Err(UtilsError::EmptyInput);
355    }
356
357    Ok(ArrayStatistics {
358        mean: array_mean(array)?,
359        std: array_std(array)?,
360        min: array_min(array)?,
361        max: array_max(array)?,
362        median: array_median(array)?,
363        count: array.len(),
364    })
365}