scirs2_datasets/utils/advanced_analytics.rs

use super::Dataset;
use scirs2_core::ndarray::{Array1, Array2};
use statrs::statistics::Statistics;
use std::error::Error;

/// Insights derived from per-feature analysis of the data matrix.
#[derive(Debug, Clone)]
pub struct CorrelationInsights {
    /// Variance-based importance score for each feature, in [0, 1].
    pub feature_importance: Array1<f64>,
}

/// Summary of how close each feature's distribution is to normal.
#[derive(Debug, Clone)]
pub struct NormalityAssessment {
    /// Mean normality score across all features, in [0, 1].
    pub overall_normality: f64,
    /// Per-feature scores from the simplified normality test.
    pub shapiro_wilk_scores: Array1<f64>,
}

/// Aggregate quality metrics produced by [`AdvancedDatasetAnalyzer`].
#[derive(Debug, Clone)]
pub struct AdvancedQualityMetrics {
    /// Variance-based complexity estimate, in [0, 1].
    pub complexity_score: f64,
    /// Simplified entropy estimate, in [0, 5].
    pub entropy: f64,
    /// Fraction of values flagged as outliers by the |z| > 3 rule.
    pub outlier_score: f64,
    /// Heuristic machine-learning suitability score, in [0, 1].
    pub ml_quality_score: f64,
    /// Per-feature normality summary.
    pub normality_assessment: NormalityAssessment,
    /// Per-feature importance insights.
    pub correlation_insights: CorrelationInsights,
}

/// Configurable analyzer for advanced dataset quality assessment.
#[derive(Debug, Clone)]
pub struct AdvancedDatasetAnalyzer {
    /// Whether GPU acceleration is requested (not yet consulted by the
    /// heuristics below).
    gpu_enabled: bool,
    /// Whether to prefer higher-precision computation paths (not yet
    /// consulted by the heuristics below).
    advanced_precision: bool,
    /// Significance threshold for statistical tests (default 0.05).
    significance_threshold: f64,
}

impl Default for AdvancedDatasetAnalyzer {
    fn default() -> Self {
        Self {
            gpu_enabled: false,
            advanced_precision: false,
            significance_threshold: 0.05,
        }
    }
}

impl AdvancedDatasetAnalyzer {
    /// Creates an analyzer with default settings.
    pub fn new() -> Self {
        Self::default()
    }

    /// Enables or disables GPU acceleration (builder style).
    pub fn with_gpu(mut self, enabled: bool) -> Self {
        self.gpu_enabled = enabled;
        self
    }

    /// Enables or disables higher-precision computation (builder style).
    pub fn with_advanced_precision(mut self, enabled: bool) -> Self {
        self.advanced_precision = enabled;
        self
    }

    /// Sets the significance threshold for statistical tests (builder style).
    pub fn with_significance_threshold(mut self, threshold: f64) -> Self {
        self.significance_threshold = threshold;
        self
    }

    /// Runs every quality heuristic over `dataset` and collects the results
    /// into an [`AdvancedQualityMetrics`].
    pub fn analyze_dataset_quality(
        &self,
        dataset: &Dataset,
    ) -> Result<AdvancedQualityMetrics, Box<dyn Error>> {
        let data = &dataset.data;

        let complexity_score = self.calculate_complexity_score(data)?;
        let entropy = self.calculate_entropy(data)?;
        let outlier_score = self.calculate_outlier_score(data)?;
        let ml_quality_score = self.calculate_ml_quality_score(data)?;
        let normality_assessment = self.calculate_normality_assessment(data)?;
        let correlation_insights = self.calculate_correlation_insights(data)?;

        Ok(AdvancedQualityMetrics {
            complexity_score,
            entropy,
            outlier_score,
            ml_quality_score,
            normality_assessment,
            correlation_insights,
        })
    }

    /// Scores dataset complexity from the mean per-feature variance via
    /// `clamp(ln(mean_variance) + 1, 0, 1)`, falling back to 1.0 when the
    /// mean is undefined.
    fn calculate_complexity_score(&self, data: &Array2<f64>) -> Result<f64, Box<dyn Error>> {
        let var_mean = {
            let val = data.var_axis(scirs2_core::ndarray::Axis(0), 1.0).mean();
            if val.is_nan() {
                1.0
            } else {
                val
            }
        };
        let complexity = (var_mean.ln() + 1.0).clamp(0.0, 1.0);
        Ok(complexity)
    }

    /// Estimates dataset entropy with a simplified size-based proxy:
    /// `clamp(ln(n) / 2, 0, 5)` for `n` total values. A full estimator would
    /// bin the value distribution instead.
    fn calculate_entropy(&self, data: &Array2<f64>) -> Result<f64, Box<dyn Error>> {
        let n = data.len() as f64;
        let entropy = if n > 0.0 {
            (n.ln() / 2.0).clamp(0.0, 5.0)
        } else {
            0.0
        };
        Ok(entropy)
    }

    /// Scores outlier prevalence as the fraction of values whose z-score
    /// within their own column exceeds 3, capped at 1.0.
    fn calculate_outlier_score(&self, data: &Array2<f64>) -> Result<f64, Box<dyn Error>> {
        let threshold = 3.0;
        let mut outlier_count = 0;
        let total_count = data.len();

        for col in 0..data.ncols() {
            let column = data.column(col);
            let mean = {
                let val = column.mean();
                if val.is_nan() {
                    0.0
                } else {
                    val
                }
            };
            let std = column.var(1.0).sqrt();

            // Skip constant (or single-value) columns, where std is 0.0 or NaN.
            if std > 0.0 {
                for &value in column.iter() {
                    let z_score = (value - mean).abs() / std;
                    if z_score > threshold {
                        outlier_count += 1;
                    }
                }
            }
        }

        let outlier_ratio = outlier_count as f64 / total_count as f64;
        Ok(outlier_ratio.min(1.0))
    }

    /// Scores machine-learning suitability from the mean feature variance,
    /// rescaled through `(ln(mean_variance) + 5) / 10` and clamped to [0, 1].
    fn calculate_ml_quality_score(&self, data: &Array2<f64>) -> Result<f64, Box<dyn Error>> {
        let var_scores: Array1<f64> = data.var_axis(scirs2_core::ndarray::Axis(0), 1.0);
        let mean_variance = {
            let val = var_scores.mean();
            if val.is_nan() {
                1.0
            } else {
                val
            }
        };

        let quality_score = (mean_variance.ln() + 5.0) / 10.0;
        Ok(quality_score.clamp(0.0, 1.0))
    }

    /// Runs the simplified normality test on each feature column and averages
    /// the per-feature scores into an overall assessment (0.5 if undefined).
    fn calculate_normality_assessment(
        &self,
        data: &Array2<f64>,
    ) -> Result<NormalityAssessment, Box<dyn Error>> {
        let n_features = data.ncols();
        let mut shapiro_scores = Vec::with_capacity(n_features);

        for col in 0..n_features {
            let column = data.column(col);
            let score = self.simplified_normality_test(&column)?;
            shapiro_scores.push(score);
        }

        let shapiro_wilk_scores = Array1::from_vec(shapiro_scores);
        let overall_normality = {
            let val = shapiro_wilk_scores.view().mean();
            if val.is_nan() {
                0.5
            } else {
                val
            }
        };

        Ok(NormalityAssessment {
            overall_normality,
            shapiro_wilk_scores,
        })
    }

    /// A lightweight normality score based on sample skewness and excess
    /// kurtosis, both of which are 0 for a normal distribution. Returns a
    /// score in [0, 1]: 0.5 for samples too small to test, 0.0 for
    /// zero-variance samples.
    fn simplified_normality_test(
        &self,
        data: &scirs2_core::ndarray::ArrayView1<f64>,
    ) -> Result<f64, Box<dyn Error>> {
        let n = data.len();
        if n < 3 {
            return Ok(0.5);
        }

        let mean = match data.mean() {
            Some(val) if !val.is_nan() => val,
            _ => 0.0,
        };
        let variance = data.var(1.0);

        if variance == 0.0 {
            return Ok(0.0);
        }

        let std_dev = variance.sqrt();

        // Accumulate the third and fourth standardized moments.
        let mut skewness: f64 = 0.0;
        let mut kurtosis: f64 = 0.0;

        for &value in data.iter() {
            let normalized = (value - mean) / std_dev;
            skewness += normalized.powi(3);
            kurtosis += normalized.powi(4);
        }

        skewness /= n as f64;
        kurtosis = kurtosis / (n as f64) - 3.0; // excess kurtosis

        // Penalize departures from normality (nonzero skewness or excess kurtosis).
        let skew_penalty = (skewness.abs() / 2.0).min(1.0);
        let kurt_penalty = (kurtosis.abs() / 4.0).min(1.0);
        let normality_score: f64 = 1.0 - (skew_penalty + kurt_penalty) / 2.0;

        Ok(normality_score.clamp(0.0, 1.0))
    }

    /// Derives per-feature importance scores. Despite the name, this
    /// heuristic is variance-based: each feature's variance is mapped through
    /// `clamp(ln(variance) + 1, 0, 1)` rather than computing correlations.
    fn calculate_correlation_insights(
        &self,
        data: &Array2<f64>,
    ) -> Result<CorrelationInsights, Box<dyn Error>> {
        let n_features = data.ncols();
        let mut importance_scores = Vec::with_capacity(n_features);

        for i in 0..n_features {
            let feature = data.column(i);
            let variance = feature.var(1.0);

            let importance = (variance.ln() + 1.0).clamp(0.0, 1.0);
            importance_scores.push(importance);
        }

        let feature_importance = Array1::from_vec(importance_scores);

        Ok(CorrelationInsights { feature_importance })
    }
}

/// Computes a fast quality score in [0, 1] as the mean of three components:
/// completeness (fraction of finite values), variance coverage (fraction of
/// features with non-negligible variance), and a logarithmic sample-size
/// score.
pub fn quick_quality_assessment(dataset: &Dataset) -> Result<f64, Box<dyn Error>> {
    let data = &dataset.data;

    let n_samples = data.nrows();
    let n_features = data.ncols();

    if n_samples == 0 || n_features == 0 {
        return Ok(0.0);
    }

    // Completeness: fraction of entries that are finite (not NaN or infinite).
    let valid_count = data.iter().filter(|&&x| x.is_finite()).count();
    let completeness = valid_count as f64 / data.len() as f64;

    // Variance coverage: fraction of features whose variance exceeds 1e-10.
    let variances: Array1<f64> = data.var_axis(scirs2_core::ndarray::Axis(0), 1.0);
    let non_zero_var_count = variances.iter().filter(|&&x| x > 1e-10).count();
    let variance_score = non_zero_var_count as f64 / n_features as f64;

    // Size score: grows logarithmically with the number of samples.
    let size_score = ((n_samples as f64).ln() / 10.0).clamp(0.0, 1.0);

    let quality_score = (completeness + variance_score + size_score) / 3.0;

    Ok(quality_score.clamp(0.0, 1.0))
}

/// Convenience wrapper: analyzes `dataset` with a default configuration
/// (GPU off, advanced precision on, 0.05 significance threshold).
#[allow(dead_code)]
pub fn analyze_dataset_advanced(
    dataset: &Dataset,
) -> Result<AdvancedQualityMetrics, Box<dyn Error>> {
    let analyzer = AdvancedDatasetAnalyzer::new()
        .with_gpu(false)
        .with_advanced_precision(true)
        .with_significance_threshold(0.05);

    analyzer.analyze_dataset_quality(dataset)
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array2;

    #[test]
    fn test_quick_quality_assessment() {
        let data = Array2::from_shape_vec((10, 3), (0..30).map(|x| x as f64).collect()).unwrap();
        let dataset = Dataset::new(data, None);

        let quality = quick_quality_assessment(&dataset).unwrap();
        assert!((0.0..=1.0).contains(&quality));
    }
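
    // A hedged addition: exercises the documented early return for an empty
    // data matrix, which should yield a quality score of exactly 0.0.
    #[test]
    fn test_quick_quality_assessment_empty_dataset() {
        let data = Array2::from_shape_vec((0, 3), Vec::<f64>::new()).unwrap();
        let dataset = Dataset::new(data, None);

        let quality = quick_quality_assessment(&dataset).unwrap();
        assert_eq!(quality, 0.0);
    }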

    #[test]
    fn test_advanced_dataset_analyzer() {
        let data = Array2::from_shape_vec((10, 3), (0..30).map(|x| x as f64).collect()).unwrap();
        let dataset = Dataset::new(data, None);

        let analyzer = AdvancedDatasetAnalyzer::new()
            .with_gpu(false)
            .with_advanced_precision(true);

        let metrics = analyzer.analyze_dataset_quality(&dataset).unwrap();
        assert!(metrics.complexity_score >= 0.0);
        assert!(metrics.entropy >= 0.0);
        assert!(metrics.outlier_score >= 0.0);
        assert!(metrics.ml_quality_score >= 0.0);
    }
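
    // A hedged addition: the |z| > 3 rule in `calculate_outlier_score` should
    // flag a single extreme value injected into an otherwise smooth ramp
    // (its z-score is roughly 4.2), so the score should be strictly positive.
    #[test]
    fn test_outlier_score_flags_injected_outlier() {
        let mut values: Vec<f64> = (0..20).map(|x| x as f64).collect();
        values[19] = 1_000.0;
        let data = Array2::from_shape_vec((20, 1), values).unwrap();
        let dataset = Dataset::new(data, None);

        let analyzer = AdvancedDatasetAnalyzer::new();
        let metrics = analyzer.analyze_dataset_quality(&dataset).unwrap();
        assert!(metrics.outlier_score > 0.0);
    }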

    #[test]
    fn test_normality_assessment() {
        let data = Array2::from_shape_vec((20, 2), (0..40).map(|x| x as f64).collect()).unwrap();
        let dataset = Dataset::new(data, None);

        let analyzer = AdvancedDatasetAnalyzer::new();
        let metrics = analyzer.analyze_dataset_quality(&dataset).unwrap();

        assert!(metrics.normality_assessment.overall_normality >= 0.0);
        assert!(metrics.normality_assessment.overall_normality <= 1.0);
        assert_eq!(metrics.normality_assessment.shapiro_wilk_scores.len(), 2);
    }
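
    // A hedged addition: `simplified_normality_test` returns 0.0 for a
    // zero-variance sample, so a constant feature should score exactly 0.0,
    // while the varying second feature keeps the rest of the pipeline
    // well-defined.
    #[test]
    fn test_constant_feature_gets_zero_normality_score() {
        let mut values = Vec::with_capacity(40);
        for i in 0..20 {
            values.push(1.0); // column 0: constant
            values.push(i as f64); // column 1: ramp
        }
        let data = Array2::from_shape_vec((20, 2), values).unwrap();
        let dataset = Dataset::new(data, None);

        let analyzer = AdvancedDatasetAnalyzer::new();
        let metrics = analyzer.analyze_dataset_quality(&dataset).unwrap();
        assert_eq!(metrics.normality_assessment.shapiro_wilk_scores[0], 0.0);
    }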

    #[test]
    fn test_correlation_insights() {
        let data = Array2::from_shape_vec((15, 3), (0..45).map(|x| x as f64).collect()).unwrap();
        let dataset = Dataset::new(data, None);

        let analyzer = AdvancedDatasetAnalyzer::new();
        let metrics = analyzer.analyze_dataset_quality(&dataset).unwrap();

        assert_eq!(metrics.correlation_insights.feature_importance.len(), 3);
        assert!(metrics
            .correlation_insights
            .feature_importance
            .iter()
            .all(|&x| (0.0..=1.0).contains(&x)));
    }
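
    // A hedged addition: each `with_*` builder method should override only
    // its own field, leaving the remaining `Default` values untouched.
    #[test]
    fn test_builder_overrides_defaults() {
        let analyzer = AdvancedDatasetAnalyzer::new()
            .with_gpu(true)
            .with_significance_threshold(0.01);

        assert!(analyzer.gpu_enabled);
        assert!(!analyzer.advanced_precision);
        assert!((analyzer.significance_threshold - 0.01).abs() < f64::EPSILON);
    }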
}