scirs2_datasets/utils/advanced_analytics.rs

use super::Dataset;
use ndarray::{Array1, Array2};
use std::error::Error;

/// Insights derived from feature relationships.
#[derive(Debug, Clone)]
pub struct CorrelationInsights {
    /// Per-feature importance scores in `[0.0, 1.0]`.
    pub feature_importance: Array1<f64>,
}

/// Summary of how closely each feature follows a normal distribution.
#[derive(Debug, Clone)]
pub struct NormalityAssessment {
    /// Mean normality score across all features, in `[0.0, 1.0]`.
    pub overall_normality: f64,
    /// Simplified Shapiro-Wilk-style score for each feature.
    pub shapiro_wilk_scores: Array1<f64>,
}

/// Aggregated quality metrics produced by `AdvancedDatasetAnalyzer`.
#[derive(Debug, Clone)]
pub struct AdvancedQualityMetrics {
    /// Variance-based complexity estimate in `[0.0, 1.0]`.
    pub complexity_score: f64,
    /// Coarse entropy estimate of the data distribution.
    pub entropy: f64,
    /// Fraction of values flagged as z-score outliers.
    pub outlier_score: f64,
    /// Overall suitability score for machine-learning use.
    pub ml_quality_score: f64,
    /// Per-feature normality results.
    pub normality_assessment: NormalityAssessment,
    /// Correlation-derived feature insights.
    pub correlation_insights: CorrelationInsights,
}

/// Configurable analyzer for advanced dataset quality checks.
#[derive(Debug, Clone)]
pub struct AdvancedDatasetAnalyzer {
    gpu_enabled: bool,
    advanced_precision: bool,
    significance_threshold: f64,
}

impl Default for AdvancedDatasetAnalyzer {
    fn default() -> Self {
        Self {
            gpu_enabled: false,
            advanced_precision: false,
            significance_threshold: 0.05,
        }
    }
}

impl AdvancedDatasetAnalyzer {
    /// Creates an analyzer with default settings.
    pub fn new() -> Self {
        Self::default()
    }

    /// Enables or disables GPU-accelerated analysis paths.
    pub fn with_gpu(mut self, enabled: bool) -> Self {
        self.gpu_enabled = enabled;
        self
    }

    /// Enables or disables higher-precision computation.
    pub fn with_advanced_precision(mut self, enabled: bool) -> Self {
        self.advanced_precision = enabled;
        self
    }

    /// Sets the significance threshold used for statistical tests.
    pub fn with_significance_threshold(mut self, threshold: f64) -> Self {
        self.significance_threshold = threshold;
        self
    }
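
    // Builder-style configuration example (a minimal sketch using only the
    // setters defined above):
    //
    //     let analyzer = AdvancedDatasetAnalyzer::new()
    //         .with_gpu(true)
    //         .with_significance_threshold(0.01);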

    /// Runs the full suite of quality analyses on `dataset`.
    pub fn analyze_dataset_quality(
        &self,
        dataset: &Dataset,
    ) -> Result<AdvancedQualityMetrics, Box<dyn Error>> {
        let data = &dataset.data;

        let complexity_score = self.calculate_complexity_score(data)?;
        let entropy = self.calculate_entropy(data)?;
        let outlier_score = self.calculate_outlier_score(data)?;
        let ml_quality_score = self.calculate_ml_quality_score(data)?;
        let normality_assessment = self.calculate_normality_assessment(data)?;
        let correlation_insights = self.calculate_correlation_insights(data)?;

        Ok(AdvancedQualityMetrics {
            complexity_score,
            entropy,
            outlier_score,
            ml_quality_score,
            normality_assessment,
            correlation_insights,
        })
    }

    /// Scores dataset complexity from the mean per-feature variance,
    /// mapped into `[0.0, 1.0]` via a log transform.
    fn calculate_complexity_score(&self, data: &Array2<f64>) -> Result<f64, Box<dyn Error>> {
        // `mean()` yields `None` on empty input; treat NaN the same way and
        // fall back to a neutral 1.0.
        let var_mean = data
            .var_axis(ndarray::Axis(0), 1.0)
            .mean()
            .filter(|v| !v.is_nan())
            .unwrap_or(1.0);
        let complexity = (var_mean.ln() + 1.0).clamp(0.0, 1.0);
        Ok(complexity)
    }

    /// Estimates distribution entropy. This is a coarse, size-based proxy
    /// (half the log of the element count), not a true entropy estimator.
    fn calculate_entropy(&self, data: &Array2<f64>) -> Result<f64, Box<dyn Error>> {
        let n = data.len() as f64;
        let entropy = if n > 0.0 {
            (n.ln() / 2.0).clamp(0.0, 5.0)
        } else {
            0.0
        };
        Ok(entropy)
    }

    /// Counts values whose z-score exceeds 3.0 and returns the outlier ratio.
    fn calculate_outlier_score(&self, data: &Array2<f64>) -> Result<f64, Box<dyn Error>> {
        let threshold = 3.0;
        let mut outlier_count = 0;
        let total_count = data.len();

        if total_count == 0 {
            return Ok(0.0);
        }

        for col in 0..data.ncols() {
            let column = data.column(col);
            // `mean()` yields `None` on empty input; fall back to 0.0.
            let mean = column.mean().filter(|v| !v.is_nan()).unwrap_or(0.0);
            let std = column.var(1.0).sqrt();

            if std > 0.0 {
                for &value in column.iter() {
                    let z_score = (value - mean).abs() / std;
                    if z_score > threshold {
                        outlier_count += 1;
                    }
                }
            }
        }

        let outlier_ratio = outlier_count as f64 / total_count as f64;
        Ok(outlier_ratio.min(1.0))
    }

    /// Maps the mean feature variance onto a `[0.0, 1.0]` ML-suitability score.
    fn calculate_ml_quality_score(&self, data: &Array2<f64>) -> Result<f64, Box<dyn Error>> {
        let var_scores: Array1<f64> = data.var_axis(ndarray::Axis(0), 1.0);
        // `mean()` yields `None` on empty input; fall back to a neutral 1.0.
        let mean_variance = var_scores.mean().filter(|v| !v.is_nan()).unwrap_or(1.0);

        let quality_score = (mean_variance.ln() + 5.0) / 10.0;
        Ok(quality_score.clamp(0.0, 1.0))
    }

    /// Runs the simplified normality test on every feature column.
    fn calculate_normality_assessment(
        &self,
        data: &Array2<f64>,
    ) -> Result<NormalityAssessment, Box<dyn Error>> {
        let n_features = data.ncols();
        let mut shapiro_scores = Vec::with_capacity(n_features);

        for col in 0..n_features {
            let column = data.column(col);
            let score = self.simplified_normality_test(&column)?;
            shapiro_scores.push(score);
        }

        let shapiro_wilk_scores = Array1::from_vec(shapiro_scores);
        // `mean()` yields `None` on empty input; fall back to a neutral 0.5.
        let overall_normality = shapiro_wilk_scores
            .mean()
            .filter(|v| !v.is_nan())
            .unwrap_or(0.5);

        Ok(NormalityAssessment {
            overall_normality,
            shapiro_wilk_scores,
        })
    }

    /// A lightweight normality check based on sample skewness and excess
    /// kurtosis; it is not a full Shapiro-Wilk test.
    fn simplified_normality_test(
        &self,
        data: &ndarray::ArrayView1<f64>,
    ) -> Result<f64, Box<dyn Error>> {
        let n = data.len();
        if n < 3 {
            return Ok(0.5);
        }

        let mean = match data.mean() {
            Some(val) if !val.is_nan() => val,
            _ => 0.0,
        };
        let variance = data.var(1.0);

        if variance == 0.0 {
            return Ok(0.0);
        }

        let std_dev = variance.sqrt();

        let mut skewness: f64 = 0.0;
        let mut kurtosis: f64 = 0.0;

        for &value in data.iter() {
            let normalized = (value - mean) / std_dev;
            skewness += normalized.powi(3);
            kurtosis += normalized.powi(4);
        }

        skewness /= n as f64;
        // Subtract 3.0 to obtain excess kurtosis (0.0 for a normal distribution).
        kurtosis = kurtosis / (n as f64) - 3.0;

        // Both moments are 0.0 for a perfect normal distribution, so penalize
        // deviations from zero.
        let skew_penalty = (skewness.abs() / 2.0).min(1.0);
        let kurt_penalty = (kurtosis.abs() / 4.0).min(1.0);
        let normality_score: f64 = 1.0 - (skew_penalty + kurt_penalty) / 2.0;

        Ok(normality_score.clamp(0.0, 1.0))
    }

    /// Derives per-feature importance scores. Currently this uses a
    /// variance-based heuristic rather than pairwise correlations.
    fn calculate_correlation_insights(
        &self,
        data: &Array2<f64>,
    ) -> Result<CorrelationInsights, Box<dyn Error>> {
        let n_features = data.ncols();
        let mut importance_scores = Vec::with_capacity(n_features);

        for i in 0..n_features {
            let feature = data.column(i);
            let variance = feature.var(1.0);

            let importance = (variance.ln() + 1.0).clamp(0.0, 1.0);
            importance_scores.push(importance);
        }

        let feature_importance = Array1::from_vec(importance_scores);

        Ok(CorrelationInsights { feature_importance })
    }
}

/// Fast, lightweight quality check combining completeness, per-feature
/// variance, and sample-size scores into a single value in `[0.0, 1.0]`.
pub fn quick_quality_assessment(dataset: &Dataset) -> Result<f64, Box<dyn Error>> {
    let data = &dataset.data;

    let n_samples = data.nrows();
    let n_features = data.ncols();

    if n_samples == 0 || n_features == 0 {
        return Ok(0.0);
    }

    // Completeness: fraction of finite (non-NaN, non-infinite) values.
    let valid_count = data.iter().filter(|&&x| x.is_finite()).count();
    let completeness = valid_count as f64 / data.len() as f64;

    // Variance score: fraction of features with non-trivial variance.
    let variances: Array1<f64> = data.var_axis(ndarray::Axis(0), 1.0);
    let non_zero_var_count = variances.iter().filter(|&&x| x > 1e-10).count();
    let variance_score = non_zero_var_count as f64 / n_features as f64;

    // Size score: log-scaled reward for larger sample counts.
    let size_score = ((n_samples as f64).ln() / 10.0).clamp(0.0, 1.0);

    let quality_score = (completeness + variance_score + size_score) / 3.0;

    Ok(quality_score.clamp(0.0, 1.0))
}

/// Convenience wrapper that runs the full analyzer with sensible defaults.
#[allow(dead_code)]
pub fn analyze_dataset_advanced(
    dataset: &Dataset,
) -> Result<AdvancedQualityMetrics, Box<dyn Error>> {
    let analyzer = AdvancedDatasetAnalyzer::new()
        .with_gpu(false)
        .with_advanced_precision(true)
        .with_significance_threshold(0.05);

    analyzer.analyze_dataset_quality(dataset)
}
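
// Example usage (a minimal sketch; `values` stands in for any `Vec<f64>` of
// length 400, and `Dataset::new(data, None)` mirrors the constructor used in
// the tests below):
//
//     let data = ndarray::Array2::from_shape_vec((100, 4), values)?;
//     let dataset = Dataset::new(data, None);
//     let metrics = analyze_dataset_advanced(&dataset)?;
//     println!("ML quality score: {:.2}", metrics.ml_quality_score);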

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array2;

    #[test]
    fn test_quick_quality_assessment() {
        let data = Array2::from_shape_vec((10, 3), (0..30).map(|x| x as f64).collect()).unwrap();
        let dataset = Dataset::new(data, None);

        let quality = quick_quality_assessment(&dataset).unwrap();
        assert!((0.0..=1.0).contains(&quality));
    }

    #[test]
    fn test_advanced_dataset_analyzer() {
        let data = Array2::from_shape_vec((10, 3), (0..30).map(|x| x as f64).collect()).unwrap();
        let dataset = Dataset::new(data, None);

        let analyzer = AdvancedDatasetAnalyzer::new()
            .with_gpu(false)
            .with_advanced_precision(true);

        let metrics = analyzer.analyze_dataset_quality(&dataset).unwrap();
        assert!(metrics.complexity_score >= 0.0);
        assert!(metrics.entropy >= 0.0);
        assert!(metrics.outlier_score >= 0.0);
        assert!(metrics.ml_quality_score >= 0.0);
    }
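
    // Added check (a sketch following the test conventions above): a single
    // extreme value should push the z-score outlier ratio above zero. The
    // tests module can call the parent module's private methods directly.
    #[test]
    fn test_outlier_score_flags_extreme_values() {
        // Nineteen zeros plus one extreme value; its z-score is ~4.25 (> 3.0).
        let mut values = vec![0.0; 20];
        values[19] = 100.0;
        let data = Array2::from_shape_vec((20, 1), values).unwrap();

        let analyzer = AdvancedDatasetAnalyzer::new();
        let score = analyzer.calculate_outlier_score(&data).unwrap();
        assert!(score > 0.0);
        assert!(score <= 1.0);
    }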

    #[test]
    fn test_normality_assessment() {
        let data = Array2::from_shape_vec((20, 2), (0..40).map(|x| x as f64).collect()).unwrap();
        let dataset = Dataset::new(data, None);

        let analyzer = AdvancedDatasetAnalyzer::new();
        let metrics = analyzer.analyze_dataset_quality(&dataset).unwrap();

        assert!(metrics.normality_assessment.overall_normality >= 0.0);
        assert!(metrics.normality_assessment.overall_normality <= 1.0);
        assert_eq!(metrics.normality_assessment.shapiro_wilk_scores.len(), 2);
    }
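
    // Added check (a sketch): a zero-variance column short-circuits the
    // simplified moment-based test, and inputs with fewer than three samples
    // return the neutral 0.5.
    #[test]
    fn test_simplified_normality_edge_cases() {
        let analyzer = AdvancedDatasetAnalyzer::new();

        // Constant data: variance is 0.0, so the score is 0.0.
        let constant = ndarray::Array1::from_elem(10, 2.0);
        let score = analyzer.simplified_normality_test(&constant.view()).unwrap();
        assert_eq!(score, 0.0);

        // Fewer than three samples: the test abstains with 0.5.
        let tiny = ndarray::Array1::from_vec(vec![1.0, 2.0]);
        let score = analyzer.simplified_normality_test(&tiny.view()).unwrap();
        assert_eq!(score, 0.5);
    }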

    #[test]
    fn test_correlation_insights() {
        let data = Array2::from_shape_vec((15, 3), (0..45).map(|x| x as f64).collect()).unwrap();
        let dataset = Dataset::new(data, None);

        let analyzer = AdvancedDatasetAnalyzer::new();
        let metrics = analyzer.analyze_dataset_quality(&dataset).unwrap();

        assert_eq!(metrics.correlation_insights.feature_importance.len(), 3);
        assert!(metrics
            .correlation_insights
            .feature_importance
            .iter()
            .all(|&x| (0.0..=1.0).contains(&x)));
    }
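
    // Added check (a sketch): injecting a NaN lowers the completeness term,
    // so the quick assessment lands strictly below a perfectly clean 1.0.
    #[test]
    fn test_quick_quality_assessment_with_missing_values() {
        let mut values: Vec<f64> = (0..20).map(|x| x as f64).collect();
        values[5] = f64::NAN;
        let data = Array2::from_shape_vec((10, 2), values).unwrap();
        let dataset = Dataset::new(data, None);

        let quality = quick_quality_assessment(&dataset).unwrap();
        assert!((0.0..1.0).contains(&quality));
    }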
}