math_ops/
statistics.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
//! Statistical methods for `Vector<T>`.

use crate::vector::Vector;
use num_traits::{Float, ToPrimitive};
use crate::WrapAsVector;

/// Descriptive statistics over a collection of values of type `T`.
///
/// `T` is expected to be a floating-point type such as `f32` or `f64`.
/// The `Vector<T>` implementation in this file skips NaN values in every
/// method, so for that implementation "empty" below means "contains no
/// non-NaN value".
pub trait Statistics<T> {
  /// Computes the arithmetic mean (average) of the data.
  ///
  /// Returns `None` for an empty dataset.
  fn mean(&self) -> Option<T>;

  /// Computes the variance of the data: the average of the squared
  /// deviations from the mean.
  ///
  /// Returns `None` for an empty dataset. The `Vector<T>` implementation in
  /// this file additionally returns `None` when fewer than two non-NaN values
  /// are present, and divides by the number of values (not by `n - 1`).
  fn var(&self) -> Option<T>;

  /// Computes the standard deviation of the data, i.e. the square root of
  /// the variance.
  ///
  /// Returns `None` in exactly the cases where [`Statistics::var`] does.
  fn stddev(&self) -> Option<T>;

  /// Computes the median: the value separating the higher half of the data
  /// from the lower half.
  ///
  /// Returns `None` for an empty dataset. For an even number of values the
  /// `Vector<T>` implementation returns the average of the two middle values.
  fn median(&self) -> Option<T>;

  /// Computes the quantile for the given fraction `q`.
  ///
  /// `q` is expected to lie between 0 and 1 inclusive; for example `q = 0.5`
  /// gives the median and `q = 0.25` the 25th percentile.
  ///
  /// Returns `None` for an empty dataset. The `Vector<T>` implementation also
  /// returns `None` when `q` is outside `[0, 1]`, and interpolates linearly
  /// between the two neighbouring sorted values when `q` falls between them.
  fn quantile(&self, q: T) -> Option<T>;

  /// Computes the interquartile range (IQR): the 75th percentile minus the
  /// 25th percentile.
  ///
  /// Returns `None` for an empty dataset.
  fn iqr(&self) -> Option<T>;

  /// Returns the minimum value in the dataset, ignoring NaN values.
  ///
  /// Returns `None` for an empty dataset.
  fn min(&self) -> Option<T>;

  /// Returns the maximum value in the dataset, ignoring NaN values.
  ///
  /// Returns `None` for an empty dataset.
  fn max(&self) -> Option<T>;

  /// Computes the cumulative sum of the data.
  ///
  /// Returns a `Vector<T>` in which each element is the running total up to
  /// and including that index. NaN values are ignored in the summation: the
  /// `Vector<T>` implementation emits the unchanged running total at a NaN
  /// position, so the output has the same length as the input.
  fn cumsum(&self) -> Vector<T>;
}

/// `Statistics` for `Vector<T>`; every method skips NaN values.
impl<T> Statistics<T> for Vector<T>
where
  T: Float + ToPrimitive + Copy + PartialOrd,
{
  /// Arithmetic mean of the non-NaN values.
  ///
  /// Returns `None` when there is no non-NaN value (including an empty
  /// vector) or when the value count cannot be represented as `T`.
  fn mean(&self) -> Option<T> {
    // The counter is an explicit `usize`: an untyped literal would default to
    // `i32` and overflow on very large inputs.
    let (sum, count) = self
      .iter()
      .copied()
      .filter(|x| !x.is_nan())
      .fold((T::zero(), 0usize), |(acc, seen), x| (acc + x, seen + 1));
    if count == 0 {
      return None;
    }
    Some(sum / T::from(count)?)
  }

  /// Variance of the non-NaN values: sum of squared deviations from the mean
  /// divided by the number of values.
  ///
  /// Returns `None` when fewer than two non-NaN values are present.
  fn var(&self) -> Option<T> {
    let mean = self.mean()?;
    let (sum_sq_diff, count) = self
      .iter()
      .copied()
      .filter(|x| !x.is_nan())
      .fold((T::zero(), 0usize), |(acc, seen), x| {
        let deviation = x - mean;
        (acc + deviation * deviation, seen + 1)
      });
    // NOTE(review): the guard requires two values (as a sample variance
    // would) while the divisor is `count` (population variance). Behaviour is
    // kept as-is; confirm which definition is intended before changing it.
    if count < 2 {
      return None;
    }
    Some(sum_sq_diff / T::from(count)?)
  }

  /// Standard deviation: square root of [`Statistics::var`].
  ///
  /// Returns `None` whenever `var` does.
  fn stddev(&self) -> Option<T> {
    self.var().map(|v| v.sqrt())
  }

  /// Median of the non-NaN values.
  ///
  /// For an even number of values the two middle values are averaged.
  /// Returns `None` when there is no non-NaN value.
  fn median(&self) -> Option<T> {
    let sorted = sorted_non_nan_values(self);
    let n = sorted.len();
    if n == 0 {
      return None;
    }
    let mid = n / 2;
    if n % 2 == 0 {
      // Build the constant 2 from `one()` so no fallible conversion is needed.
      let two = T::one() + T::one();
      Some((sorted[mid - 1] + sorted[mid]) / two)
    } else {
      Some(sorted[mid])
    }
  }

  /// Quantile of the non-NaN values at fraction `q`, using linear
  /// interpolation between the two neighbouring sorted values.
  ///
  /// Returns `None` when `q` is outside `[0, 1]` (or NaN) or when there is no
  /// non-NaN value.
  fn quantile(&self, q: T) -> Option<T> {
    // Reject out-of-range fractions before doing the sort.
    if q < T::zero() || q > T::one() {
      return None;
    }
    quantile_of_sorted(&sorted_non_nan_values(self), q)
  }

  /// Interquartile range: 75th percentile minus 25th percentile.
  ///
  /// Returns `None` when there is no non-NaN value.
  fn iqr(&self) -> Option<T> {
    // Sort once and reuse the result for both quartiles.
    let sorted = sorted_non_nan_values(self);
    let q75 = quantile_of_sorted(&sorted, T::from(0.75)?)?;
    let q25 = quantile_of_sorted(&sorted, T::from(0.25)?)?;
    Some(q75 - q25)
  }

  /// Smallest non-NaN value, or `None` when there is none.
  fn min(&self) -> Option<T> {
    self
      .iter()
      .copied()
      .filter(|x| !x.is_nan())
      .min_by(|a, b| a.partial_cmp(b).expect("NaN values are filtered out before comparison"))
  }

  /// Largest non-NaN value, or `None` when there is none.
  fn max(&self) -> Option<T> {
    self
      .iter()
      .copied()
      .filter(|x| !x.is_nan())
      .max_by(|a, b| a.partial_cmp(b).expect("NaN values are filtered out before comparison"))
  }

  /// Running total of the values, one output element per input element.
  ///
  /// A NaN input does not change the running total; the current total is
  /// still emitted at that position.
  fn cumsum(&self) -> Vector<T> {
    let mut running = T::zero();
    let mut totals = Vec::with_capacity(self.len());
    for &x in self.iter() {
      if !x.is_nan() {
        running = running + x;
      }
      totals.push(running);
    }
    totals.wrap_as_vector()
  }
}

/// Collects the non-NaN values of `data` into a `Vec` sorted in ascending order.
fn sorted_non_nan_values<T>(data: &Vector<T>) -> Vec<T>
where
  T: Float + ToPrimitive + Copy + PartialOrd,
{
  let mut values: Vec<T> = data.iter().copied().filter(|x| !x.is_nan()).collect();
  // Stable sort is kept so values that compare equal (e.g. -0.0 and 0.0)
  // retain their input order.
  values.sort_by(|a, b| a.partial_cmp(b).expect("NaN values are filtered out before sorting"));
  values
}

/// Linearly interpolated quantile of an ascending, NaN-free slice.
///
/// Returns `None` for an empty slice, for `q` outside `[0, 1]` (or NaN), or
/// when the length cannot be converted to `T`.
fn quantile_of_sorted<T>(sorted: &[T], q: T) -> Option<T>
where
  T: Float + ToPrimitive,
{
  if q < T::zero() || q > T::one() {
    return None;
  }
  let last = sorted.len().checked_sub(1)?;
  let pos = q * T::from(last)?;
  let pos_floor = pos.floor();
  let weight = pos - pos_floor;
  // Clamp to the last index: converting `last` to a low-precision `T` can
  // round upwards for very long inputs, which would otherwise index out of bounds.
  let idx_floor = pos_floor.to_usize()?.min(last);
  let idx_ceil = pos.ceil().to_usize()?.min(last);
  if idx_floor == idx_ceil {
    Some(sorted[idx_floor])
  } else {
    Some(sorted[idx_floor] + (sorted[idx_ceil] - sorted[idx_floor]) * weight)
  }
}