tenflowers_dataset/statistics/
advanced.rs1use crate::Dataset;
7use tenflowers_core::{Result, TensorError};
8
9pub struct AdvancedStatistics;
14
15impl AdvancedStatistics {
16 pub fn compute_multivariate_stats<T, D>(dataset: &D) -> Result<MultivariateStatistics<T>>
24 where
25 T: Clone
26 + Default
27 + scirs2_core::numeric::Float
28 + Send
29 + Sync
30 + bytemuck::Pod
31 + bytemuck::Zeroable
32 + 'static,
33 D: Dataset<T>,
34 {
35 let sample_count = dataset.len();
36 if sample_count == 0 {
37 return Err(TensorError::InvalidShape {
38 operation: "AdvancedStatistics::compute_multivariate_stats".to_string(),
39 reason: "Empty dataset".to_string(),
40 shape: Some(vec![0]),
41 context: None,
42 });
43 }
44
45 let (first_features, _) = dataset.get(0)?;
47 let feature_dims = first_features.shape().dims();
48 let n_features = if feature_dims.is_empty() {
49 1
50 } else {
51 feature_dims.iter().product()
52 };
53
54 let mut all_features = Vec::with_capacity(sample_count * n_features);
55
56 for i in 0..sample_count {
58 let (features, _) = dataset.get(i)?;
59 let feature_vec = features.to_vec()?;
60 all_features.extend_from_slice(&feature_vec);
61 }
62
63 let covariance_matrix =
65 Self::compute_covariance_matrix(&all_features, n_features, sample_count)?;
66 let eigenvalues = Self::compute_eigenvalues(&covariance_matrix)?;
67 let skewness = Self::compute_skewness(&all_features, n_features, sample_count)?;
68 let kurtosis = Self::compute_kurtosis(&all_features, n_features, sample_count)?;
69
70 Ok(MultivariateStatistics {
71 covariance_matrix,
72 eigenvalues,
73 skewness,
74 kurtosis,
75 n_features,
76 n_samples: sample_count,
77 })
78 }
79
80 fn compute_covariance_matrix<T>(
82 data: &[T],
83 n_features: usize,
84 n_samples: usize,
85 ) -> Result<Vec<Vec<T>>>
86 where
87 T: Clone + Default + scirs2_core::numeric::Float,
88 {
89 let mut means = vec![T::zero(); n_features];
91 for sample_idx in 0..n_samples {
92 for feat_idx in 0..n_features {
93 let data_idx = sample_idx * n_features + feat_idx;
94 means[feat_idx] = means[feat_idx] + data[data_idx].clone();
95 }
96 }
97
98 for mean in &mut means {
99 *mean =
100 mean.clone() / T::from(n_samples).expect("sample count should convert to float");
101 }
102
103 let mut cov_matrix = vec![vec![T::zero(); n_features]; n_features];
105
106 for i in 0..n_features {
107 for j in 0..n_features {
108 let mut covariance = T::zero();
109
110 for sample_idx in 0..n_samples {
111 let i_idx = sample_idx * n_features + i;
112 let j_idx = sample_idx * n_features + j;
113
114 let dev_i = data[i_idx].clone() - means[i].clone();
115 let dev_j = data[j_idx].clone() - means[j].clone();
116
117 covariance = covariance + dev_i * dev_j;
118 }
119
120 cov_matrix[i][j] = covariance
121 / T::from(n_samples - 1).expect("sample count should convert to float");
122 }
123 }
124
125 Ok(cov_matrix)
126 }
127
128 fn compute_eigenvalues<T>(matrix: &[Vec<T>]) -> Result<Vec<T>>
130 where
131 T: Clone + Default + scirs2_core::numeric::Float,
132 {
133 let n = matrix.len();
134 let mut eigenvalues = Vec::new();
135
136 for i in 0..n {
139 eigenvalues.push(matrix[i][i].clone());
140 }
141
142 Ok(eigenvalues)
143 }
144
145 fn compute_skewness<T>(data: &[T], n_features: usize, n_samples: usize) -> Result<Vec<T>>
147 where
148 T: Clone + Default + scirs2_core::numeric::Float,
149 {
150 let mut skewness_values = vec![T::zero(); n_features];
151
152 for feat_idx in 0..n_features {
153 let mut feature_values = Vec::with_capacity(n_samples);
155 for sample_idx in 0..n_samples {
156 let data_idx = sample_idx * n_features + feat_idx;
157 feature_values.push(data[data_idx].clone());
158 }
159
160 let mean = feature_values.iter().fold(T::zero(), |acc, &x| acc + x)
162 / T::from(n_samples).expect("sample count should convert to float");
163
164 let variance = feature_values
166 .iter()
167 .map(|&x| {
168 let dev = x - mean.clone();
169 dev.clone() * dev
170 })
171 .fold(T::zero(), |acc, x| acc + x)
172 / T::from(n_samples).expect("sample count should convert to float");
173
174 let std_dev = variance.sqrt();
175
176 if std_dev > T::zero() {
177 let skew = feature_values
179 .iter()
180 .map(|&x| {
181 let normalized = (x - mean.clone()) / std_dev.clone();
182 normalized.clone() * normalized.clone() * normalized
183 })
184 .fold(T::zero(), |acc, x| acc + x)
185 / T::from(n_samples).expect("sample count should convert to float");
186
187 skewness_values[feat_idx] = skew;
188 }
189 }
190
191 Ok(skewness_values)
192 }
193
194 fn compute_kurtosis<T>(data: &[T], n_features: usize, n_samples: usize) -> Result<Vec<T>>
196 where
197 T: Clone + Default + scirs2_core::numeric::Float,
198 {
199 let mut kurtosis_values = vec![T::zero(); n_features];
200
201 for feat_idx in 0..n_features {
202 let mut feature_values = Vec::with_capacity(n_samples);
204 for sample_idx in 0..n_samples {
205 let data_idx = sample_idx * n_features + feat_idx;
206 feature_values.push(data[data_idx].clone());
207 }
208
209 let mean = feature_values.iter().fold(T::zero(), |acc, &x| acc + x)
211 / T::from(n_samples).expect("sample count should convert to float");
212
213 let variance = feature_values
215 .iter()
216 .map(|&x| {
217 let dev = x - mean.clone();
218 dev.clone() * dev
219 })
220 .fold(T::zero(), |acc, x| acc + x)
221 / T::from(n_samples).expect("sample count should convert to float");
222
223 let std_dev = variance.sqrt();
224
225 if std_dev > T::zero() {
226 let kurt = feature_values
228 .iter()
229 .map(|&x| {
230 let normalized = (x - mean.clone()) / std_dev.clone();
231 let norm_squared = normalized.clone() * normalized.clone();
232 norm_squared.clone() * norm_squared
233 })
234 .fold(T::zero(), |acc, x| acc + x)
235 / T::from(n_samples).expect("sample count should convert to float");
236
237 kurtosis_values[feat_idx] =
239 kurt - T::from(3.0).expect("constant 3.0 should convert to float");
240 }
241 }
242
243 Ok(kurtosis_values)
244 }
245
246 pub fn compute_pca<T, D>(dataset: &D, n_components: usize) -> Result<PCAResult<T>>
248 where
249 T: Clone
250 + Default
251 + scirs2_core::numeric::Float
252 + Send
253 + Sync
254 + bytemuck::Pod
255 + bytemuck::Zeroable
256 + 'static,
257 D: Dataset<T>,
258 {
259 let sample_count = dataset.len();
260 if sample_count == 0 {
261 return Err(TensorError::InvalidShape {
262 operation: "AdvancedStatistics::compute_pca".to_string(),
263 reason: "Empty dataset".to_string(),
264 shape: Some(vec![0]),
265 context: None,
266 });
267 }
268
269 let multivariate_stats: MultivariateStatistics<T> =
271 Self::compute_multivariate_stats(dataset)?;
272
273 let eigenvalues = multivariate_stats.eigenvalues;
275 let n_features = multivariate_stats.n_features;
276
277 let selected_components = std::cmp::min(n_components, eigenvalues.len());
279 let mut sorted_eigenvalues = eigenvalues.clone();
280
281 sorted_eigenvalues.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
283
284 let principal_components = sorted_eigenvalues[..selected_components].to_vec();
285 let explained_variance_ratio = Self::compute_explained_variance(&principal_components)?;
286
287 Ok(PCAResult {
288 principal_components,
289 explained_variance_ratio,
290 n_components: selected_components,
291 n_features,
292 })
293 }
294
295 fn compute_explained_variance<T>(eigenvalues: &[T]) -> Result<Vec<T>>
297 where
298 T: Clone + Default + scirs2_core::numeric::Float,
299 {
300 let total_variance = eigenvalues.iter().fold(T::zero(), |acc, &x| acc + x);
301
302 if total_variance <= T::zero() {
303 return Ok(vec![T::zero(); eigenvalues.len()]);
304 }
305
306 let ratios = eigenvalues
307 .iter()
308 .map(|&x| x / total_variance.clone())
309 .collect();
310
311 Ok(ratios)
312 }
313}
314
315#[derive(Debug, Clone)]
317pub struct MultivariateStatistics<T> {
318 pub covariance_matrix: Vec<Vec<T>>,
319 pub eigenvalues: Vec<T>,
320 pub skewness: Vec<T>,
321 pub kurtosis: Vec<T>,
322 pub n_features: usize,
323 pub n_samples: usize,
324}
325
326#[derive(Debug, Clone)]
328pub struct PCAResult<T> {
329 pub principal_components: Vec<T>,
330 pub explained_variance_ratio: Vec<T>,
331 pub n_components: usize,
332 pub n_features: usize,
333}
334
335pub trait AdvancedStatisticsExt<T> {
337 fn compute_multivariate_statistics(&self) -> Result<MultivariateStatistics<T>>;
339
340 fn compute_pca(&self, n_components: usize) -> Result<PCAResult<T>>;
342}
343
344impl<T, D> AdvancedStatisticsExt<T> for D
345where
346 T: Clone
347 + Default
348 + scirs2_core::numeric::Float
349 + Send
350 + Sync
351 + bytemuck::Pod
352 + bytemuck::Zeroable
353 + 'static,
354 D: Dataset<T>,
355{
356 fn compute_multivariate_statistics(&self) -> Result<MultivariateStatistics<T>> {
357 AdvancedStatistics::compute_multivariate_stats(self)
358 }
359
360 fn compute_pca(&self, n_components: usize) -> Result<PCAResult<T>> {
361 AdvancedStatistics::compute_pca(self, n_components)
362 }
363}