//! Advanced dataset analytics utilities (scirs2_datasets/utils/advanced_analytics.rs).

use super::Dataset;
use scirs2_core::ndarray::{Array1, Array2};
use statrs::statistics::Statistics;
use std::error::Error;

/// Correlation-derived insights for a dataset's features.
#[derive(Debug, Clone)]
pub struct CorrelationInsights {
    /// Per-feature importance score in [0, 1], derived from each feature's
    /// variance (`ln(var) + 1`, clamped) — no cross-feature correlation is
    /// actually computed by the current implementation.
    pub feature_importance: Array1<f64>,
}
17
/// Summary of per-feature normality testing.
#[derive(Debug, Clone)]
pub struct NormalityAssessment {
    /// Mean of `shapiro_wilk_scores`; falls back to 0.5 when the mean is
    /// undefined (e.g. no features).
    pub overall_normality: f64,
    /// Per-feature normality score in [0, 1]. Despite the name, these come
    /// from a simplified skewness/kurtosis heuristic, not an actual
    /// Shapiro–Wilk test (see `simplified_normality_test`).
    pub shapiro_wilk_scores: Array1<f64>,
}
26
/// Bundle of heuristic quality metrics produced by
/// `AdvancedDatasetAnalyzer::analyze_dataset_quality`.
#[derive(Debug, Clone)]
pub struct AdvancedQualityMetrics {
    /// Complexity heuristic in [0, 1] from the log of mean feature variance.
    pub complexity_score: f64,
    /// Entropy proxy in [0, 5], based on element count only.
    pub entropy: f64,
    /// Fraction of entries with |z-score| > 3 within their column, capped at 1.
    pub outlier_score: f64,
    /// ML-readiness heuristic in [0, 1] from mean feature variance.
    pub ml_quality_score: f64,
    /// Per-feature and overall normality scores.
    pub normality_assessment: NormalityAssessment,
    /// Variance-based feature-importance scores.
    pub correlation_insights: CorrelationInsights,
}
43
/// Configurable analyzer that computes `AdvancedQualityMetrics` for a `Dataset`.
#[derive(Debug, Clone)]
pub struct AdvancedDatasetAnalyzer {
    // NOTE(review): these three settings are set via the builder methods but
    // are never read by any metric calculation visible in this file — confirm
    // whether they are consumed elsewhere or are placeholders.
    gpu_enabled: bool,
    advanced_precision: bool,
    significance_threshold: f64,
}
51
52impl Default for AdvancedDatasetAnalyzer {
53 fn default() -> Self {
54 Self {
55 gpu_enabled: false,
56 advanced_precision: false,
57 significance_threshold: 0.05,
58 }
59 }
60}
61
62impl AdvancedDatasetAnalyzer {
63 pub fn new() -> Self {
65 Self::default()
66 }
67
68 pub fn with_gpu(mut self, enabled: bool) -> Self {
70 self.gpu_enabled = enabled;
71 self
72 }
73
74 pub fn with_advanced_precision(mut self, enabled: bool) -> Self {
76 self.advanced_precision = enabled;
77 self
78 }
79
80 pub fn with_significance_threshold(mut self, threshold: f64) -> Self {
82 self.significance_threshold = threshold;
83 self
84 }
85
86 pub fn analyze_dataset_quality(
88 &self,
89 dataset: &Dataset,
90 ) -> Result<AdvancedQualityMetrics, Box<dyn Error>> {
91 let data = &dataset.data;
92 let _n_features = data.ncols();
93
94 let _mean_values: Array1<f64> = data
96 .mean_axis(scirs2_core::ndarray::Axis(0))
97 .expect("Operation failed");
98 let _std_values: Array1<f64> = data
99 .var_axis(scirs2_core::ndarray::Axis(0), 1.0)
100 .mapv(|x| x.sqrt());
101
102 let complexity_score = self.calculate_complexity_score(data)?;
104
105 let entropy = self.calculate_entropy(data)?;
107
108 let outlier_score = self.calculate_outlier_score(data)?;
110
111 let ml_quality_score = self.calculate_ml_quality_score(data)?;
113
114 let normality_assessment = self.calculate_normality_assessment(data)?;
116
117 let correlation_insights = self.calculate_correlation_insights(data)?;
119
120 Ok(AdvancedQualityMetrics {
121 complexity_score,
122 entropy,
123 outlier_score,
124 ml_quality_score,
125 normality_assessment,
126 correlation_insights,
127 })
128 }
129
130 fn calculate_complexity_score(&self, data: &Array2<f64>) -> Result<f64, Box<dyn Error>> {
131 let var_mean = {
133 let val = data.var_axis(scirs2_core::ndarray::Axis(0), 1.0).mean();
134 if val.is_nan() {
135 1.0
136 } else {
137 val
138 }
139 };
140 let complexity = (var_mean.ln() + 1.0).clamp(0.0, 1.0);
141 Ok(complexity)
142 }
143
144 fn calculate_entropy(&self, data: &Array2<f64>) -> Result<f64, Box<dyn Error>> {
145 let flattened = data.iter().cloned().collect::<Vec<f64>>();
147 let mut sorted = flattened.clone();
148 sorted.sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
149
150 let n = sorted.len() as f64;
152 let entropy = if n > 0.0 {
153 (n.ln() / 2.0).clamp(0.0, 5.0)
154 } else {
155 0.0
156 };
157 Ok(entropy)
158 }
159
160 fn calculate_outlier_score(&self, data: &Array2<f64>) -> Result<f64, Box<dyn Error>> {
161 let threshold = 3.0;
163 let mut outlier_count = 0;
164 let total_count = data.len();
165
166 for col in 0..data.ncols() {
167 let column = data.column(col);
168 let mean = {
169 let val = column.mean();
170 if val.is_nan() {
171 0.0
172 } else {
173 val
174 }
175 };
176 let std = column.var(1.0).sqrt();
177
178 if std > 0.0 {
179 for &value in column.iter() {
180 let z_score = (value - mean).abs() / std;
181 if z_score > threshold {
182 outlier_count += 1;
183 }
184 }
185 }
186 }
187
188 let outlier_ratio = outlier_count as f64 / total_count as f64;
189 Ok(outlier_ratio.min(1.0))
190 }
191
192 fn calculate_ml_quality_score(&self, data: &Array2<f64>) -> Result<f64, Box<dyn Error>> {
193 let var_scores: Array1<f64> = data.var_axis(scirs2_core::ndarray::Axis(0), 1.0);
195 let mean_variance = {
196 let val = var_scores.mean();
197 if val.is_nan() {
198 1.0
199 } else {
200 val
201 }
202 };
203
204 let quality_score = (mean_variance.ln() + 5.0) / 10.0;
206 Ok(quality_score.clamp(0.0, 1.0))
207 }
208
209 fn calculate_normality_assessment(
210 &self,
211 data: &Array2<f64>,
212 ) -> Result<NormalityAssessment, Box<dyn Error>> {
213 let n_features = data.ncols();
214 let mut shapiro_scores = Vec::with_capacity(n_features);
215
216 for col in 0..n_features {
217 let column = data.column(col);
218 let score = self.simplified_normality_test(&column)?;
220 shapiro_scores.push(score);
221 }
222
223 let shapiro_wilk_scores = Array1::from_vec(shapiro_scores);
224 let overall_normality = {
225 let val = shapiro_wilk_scores.view().mean();
226 if val.is_nan() {
227 0.5
228 } else {
229 val
230 }
231 };
232
233 Ok(NormalityAssessment {
234 overall_normality,
235 shapiro_wilk_scores,
236 })
237 }
238
239 fn simplified_normality_test(
240 &self,
241 data: &scirs2_core::ndarray::ArrayView1<f64>,
242 ) -> Result<f64, Box<dyn Error>> {
243 let n = data.len();
245 if n < 3 {
246 return Ok(0.5);
247 }
248
249 use scirs2_core::ndarray::ArrayStatCompat;
250 let mean = data.mean_or(0.0);
251 let variance = data.var(1.0);
252
253 if variance == 0.0 {
254 return Ok(0.0);
255 }
256
257 let std_dev = variance.sqrt();
258
259 let mut skewness: f64 = 0.0;
261 let mut kurtosis: f64 = 0.0;
262
263 for &value in data.iter() {
264 let normalized = (value - mean) / std_dev;
265 skewness += normalized.powi(3);
266 kurtosis += normalized.powi(4);
267 }
268
269 skewness /= n as f64;
270 kurtosis = kurtosis / (n as f64) - 3.0; let skew_penalty = (skewness.abs() / 2.0).min(1.0);
274 let kurt_penalty = (kurtosis.abs() / 4.0).min(1.0);
275 let normality_score: f64 = 1.0 - (skew_penalty + kurt_penalty) / 2.0;
276
277 Ok(normality_score.clamp(0.0, 1.0))
278 }
279
280 fn calculate_correlation_insights(
281 &self,
282 data: &Array2<f64>,
283 ) -> Result<CorrelationInsights, Box<dyn Error>> {
284 let n_features = data.ncols();
285 let mut importance_scores = Vec::with_capacity(n_features);
286
287 for i in 0..n_features {
289 let feature = data.column(i);
290 let variance = feature.var(1.0);
291
292 let importance = (variance.ln() + 1.0).clamp(0.0, 1.0);
294 importance_scores.push(importance);
295 }
296
297 let feature_importance = Array1::from_vec(importance_scores);
298
299 Ok(CorrelationInsights { feature_importance })
300 }
301}
302
303pub fn quick_quality_assessment(dataset: &Dataset) -> Result<f64, Box<dyn Error>> {
305 let data = &dataset.data;
306
307 let n_samples = data.nrows();
309 let n_features = data.ncols();
310
311 if n_samples == 0 || n_features == 0 {
312 return Ok(0.0);
313 }
314
315 let valid_count = data.iter().filter(|&&x| x.is_finite()).count();
317 let completeness = valid_count as f64 / data.len() as f64;
318
319 let variances: Array1<f64> = data.var_axis(scirs2_core::ndarray::Axis(0), 1.0);
321 let non_zero_var_count = variances.iter().filter(|&&x| x > 1e-10).count();
322 let variance_score = non_zero_var_count as f64 / n_features as f64;
323
324 let size_score = ((n_samples as f64).ln() / 10.0).clamp(0.0, 1.0);
326
327 let quality_score = (completeness + variance_score + size_score) / 3.0;
329
330 Ok(quality_score.clamp(0.0, 1.0))
331}
332
333#[allow(dead_code)]
335pub fn analyze_dataset_advanced(
336 dataset: &Dataset,
337) -> Result<AdvancedQualityMetrics, Box<dyn Error>> {
338 let analyzer = AdvancedDatasetAnalyzer::new()
339 .with_gpu(false)
340 .with_advanced_precision(true)
341 .with_significance_threshold(0.05);
342
343 analyzer.analyze_dataset_quality(dataset)
344}
345
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array2;

    /// Builds a rows×cols matrix filled with 0..rows*cols as f64.
    fn sample_matrix(rows: usize, cols: usize) -> Array2<f64> {
        Array2::from_shape_vec((rows, cols), (0..rows * cols).map(|x| x as f64).collect())
            .expect("Operation failed")
    }

    #[test]
    fn test_quick_quality_assessment() {
        let ds = Dataset::new(sample_matrix(10, 3), None);

        let score = quick_quality_assessment(&ds).expect("Operation failed");
        assert!((0.0..=1.0).contains(&score));
    }

    #[test]
    fn test_advanced_dataset_analyzer() {
        let ds = Dataset::new(sample_matrix(10, 3), None);

        let analyzer = AdvancedDatasetAnalyzer::new()
            .with_gpu(false)
            .with_advanced_precision(true);

        let m = analyzer
            .analyze_dataset_quality(&ds)
            .expect("Operation failed");
        // All scalar metrics are non-negative by construction.
        assert!(m.complexity_score >= 0.0);
        assert!(m.entropy >= 0.0);
        assert!(m.outlier_score >= 0.0);
        assert!(m.ml_quality_score >= 0.0);
    }

    #[test]
    fn test_normality_assessment() {
        let ds = Dataset::new(sample_matrix(20, 2), None);

        let m = AdvancedDatasetAnalyzer::new()
            .analyze_dataset_quality(&ds)
            .expect("Operation failed");

        let na = &m.normality_assessment;
        assert!(na.overall_normality >= 0.0);
        assert!(na.overall_normality <= 1.0);
        assert_eq!(na.shapiro_wilk_scores.len(), 2);
    }

    #[test]
    fn test_correlation_insights() {
        let ds = Dataset::new(sample_matrix(15, 3), None);

        let m = AdvancedDatasetAnalyzer::new()
            .analyze_dataset_quality(&ds)
            .expect("Operation failed");

        let imp = &m.correlation_insights.feature_importance;
        assert_eq!(imp.len(), 3);
        assert!(imp.iter().all(|&x| (0.0..=1.0).contains(&x)));
    }
}