use crate::error::{DatasetsError, Result};
use crate::utils::Dataset;
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2, Axis};
use scirs2_core::parallel_ops::*;
use statrs::statistics::Statistics;

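/// Aggregated, heuristic quality metrics for a dataset, produced by
/// [`AdvancedDatasetAnalyzer::analyze_dataset_quality`]. Matrices are indexed
/// by feature (column) position.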
#[derive(Debug, Clone)]
pub struct AdvancedQualityMetrics {
    pub complexity_score: f64,
    pub entropy: f64,
    pub outlier_score: f64,
    pub interaction_matrix: Array2<f64>,
    pub normality_assessment: NormalityAssessment,
    pub ml_quality_score: f64,
    pub correlation_insights: CorrelationInsights,
}

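/// Per-feature normality scores (one entry per column) together with a
/// weighted overall score in `[0.0, 1.0]`; higher values indicate more
/// Gaussian-looking features.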
#[derive(Debug, Clone)]
pub struct NormalityAssessment {
    pub shapiro_wilk_scores: Array1<f64>,
    pub anderson_darling_scores: Array1<f64>,
    pub jarque_bera_scores: Array1<f64>,
    pub overall_normality: f64,
}

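/// Pairwise feature relationships: linear (Pearson) and nonlinear (mutual
/// information) correlation matrices, a correlation-based causality hint
/// matrix, and a per-feature importance score derived from mean absolute
/// linear correlation.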
#[derive(Debug, Clone)]
pub struct CorrelationInsights {
    pub linear_correlations: Array2<f64>,
    pub nonlinear_correlations: Array2<f64>,
    pub causality_hints: Array2<f64>,
    pub feature_importance: Array1<f64>,
}

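/// Configurable analyzer that computes [`AdvancedQualityMetrics`] for a
/// [`Dataset`]. Options can be adjusted with the builder-style `with_*`
/// methods below.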
pub struct AdvancedDatasetAnalyzer {
    use_gpu: bool,
    advanced_precision: bool,
    significance_threshold: f64,
}

impl Default for AdvancedDatasetAnalyzer {
    fn default() -> Self {
        Self {
            use_gpu: true,
            advanced_precision: true,
            significance_threshold: 0.01,
        }
    }
}

impl AdvancedDatasetAnalyzer {
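    /// Creates an analyzer with the default configuration (GPU enabled,
    /// advanced precision enabled, significance threshold of 0.01).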
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_gpu(mut self, use_gpu: bool) -> Self {
        self.use_gpu = use_gpu;
        self
    }

    pub fn with_advanced_precision(mut self, advanced_precision: bool) -> Self {
        self.advanced_precision = advanced_precision;
        self
    }

    pub fn with_significance_threshold(mut self, threshold: f64) -> Self {
        self.significance_threshold = threshold;
        self
    }

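    /// Runs the full suite of quality analyses on `dataset` and returns the
    /// collected [`AdvancedQualityMetrics`]. Returns a
    /// [`DatasetsError::ValidationError`] if the dataset has fewer than 3
    /// samples or no features.
    ///
    /// A minimal usage sketch (assuming a `Dataset` built the same way as in
    /// the tests below):
    ///
    /// ```ignore
    /// let analyzer = AdvancedDatasetAnalyzer::new().with_gpu(false);
    /// let metrics = analyzer.analyze_dataset_quality(&dataset)?;
    /// assert!((0.0..=1.0).contains(&metrics.ml_quality_score));
    /// ```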
    pub fn analyze_dataset_quality(&self, dataset: &Dataset) -> Result<AdvancedQualityMetrics> {
        let data = &dataset.data;
        let n_samples = data.nrows();
        let n_features = data.ncols();

        if n_samples < 3 || n_features == 0 {
            return Err(DatasetsError::ValidationError(
                "Dataset too small for advanced analysis".to_string(),
            ));
        }

        let complexity_score = self.calculate_complexity_score(data.view())?;
        let entropy = self.calculate_dataset_entropy(data.view())?;
        let outlier_score = self.calculate_outlier_score(data.view())?;
        let interaction_matrix = self.calculate_interaction_matrix(data.view())?;
        let normality_assessment = self.assess_normality(data.view())?;
        let ml_quality_score = self.predict_ml_quality(data.view())?;
        let correlation_insights = self.analyze_correlations(data.view())?;

        Ok(AdvancedQualityMetrics {
            complexity_score,
            entropy,
            outlier_score,
            interaction_matrix,
            normality_assessment,
            ml_quality_score,
            correlation_insights,
        })
    }

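    /// Heuristic complexity score: the geometric mean of the per-feature
    /// normalized histogram entropies, computed in parallel across columns.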
    fn calculate_complexity_score(&self, data: ArrayView2<f64>) -> Result<f64> {
        let n_features = data.ncols();
        let complexity_scores = (0..n_features)
            .into_par_iter()
            .map(|i| {
                let feature = data.column(i);
                self.calculate_feature_complexity(feature)
            })
            .collect::<Result<Vec<_>>>()?;

        let product: f64 = complexity_scores.iter().product();
        Ok(product.powf(1.0 / n_features as f64))
    }

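    /// Normalized Shannon entropy of a single feature: values are binned into
    /// roughly `sqrt(n)` bins (clamped to 10..=100) and the histogram entropy
    /// is divided by `ln(n_bins)`, yielding a score in `[0.0, 1.0]`.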
    fn calculate_feature_complexity(&self, feature: ArrayView1<f64>) -> Result<f64> {
        let mut values = feature.to_vec();
        values.sort_by(|a, b| a.total_cmp(b));

        let n_bins = ((values.len() as f64).sqrt() as usize).clamp(10, 100);
        let min_val = values[0];
        let max_val = values[values.len() - 1];

        if (max_val - min_val).abs() < f64::EPSILON {
            return Ok(0.0);
        }

        let bin_width = (max_val - min_val) / n_bins as f64;
        let mut histogram = vec![0; n_bins];

        for &value in &values {
            let bin_idx = ((value - min_val) / bin_width) as usize;
            let bin_idx = bin_idx.min(n_bins - 1);
            histogram[bin_idx] += 1;
        }

        let n_total = values.len() as f64;
        let entropy = histogram
            .iter()
            .filter(|&&count| count > 0)
            .map(|&count| {
                let p = count as f64 / n_total;
                -p * p.ln()
            })
            .sum::<f64>();

        let max_entropy = (n_bins as f64).ln();
        Ok(entropy / max_entropy)
    }

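    /// Dataset-level entropy estimate: the summed per-feature entropies minus
    /// an average mutual-information correction for redundancy between
    /// features, floored at zero.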
    fn calculate_dataset_entropy(&self, data: ArrayView2<f64>) -> Result<f64> {
        let n_features = data.ncols();

        let feature_entropies: Vec<f64> = (0..n_features)
            .into_par_iter()
            .map(|i| {
                let feature = data.column(i);
                self.calculate_feature_complexity(feature).unwrap_or(0.0)
            })
            .collect();

        let mean_entropy = feature_entropies.iter().sum::<f64>() / n_features as f64;
        let mutual_info_correction = self.estimate_mutual_information(data)?;

        Ok((mean_entropy * n_features as f64 - mutual_info_correction).max(0.0))
    }

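    /// Estimates the average pairwise mutual information between features,
    /// subsampling feature pairs with a stride derived from a budget of about
    /// 100 pairs so that wide datasets stay tractable.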
    fn estimate_mutual_information(&self, data: ArrayView2<f64>) -> Result<f64> {
        let n_features = data.ncols();
        if n_features < 2 {
            return Ok(0.0);
        }

        let max_pairs = 100;
        let step = ((n_features * (n_features - 1) / 2) / max_pairs).max(1);

        let mut total_mi = 0.0;
        let mut pair_count = 0;

        for i in (0..n_features).step_by(step) {
            for j in (i + 1..n_features).step_by(step) {
                let mi = self.calculate_mutual_information(data.column(i), data.column(j))?;
                total_mi += mi;
                pair_count += 1;
            }
        }

        Ok(if pair_count > 0 {
            total_mi / pair_count as f64
        } else {
            0.0
        })
    }

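    /// Mutual information between two features, estimated from a fixed
    /// 20 x 20 joint histogram: `MI = sum p(x,y) * ln(p(x,y) / (p(x) * p(y)))`,
    /// floored at zero to absorb numerical noise.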
    fn calculate_mutual_information(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> Result<f64> {
        let n_bins = 20;
        let x_min = x.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let x_max = x.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let y_min = y.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let y_max = y.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

        if (x_max - x_min).abs() < f64::EPSILON || (y_max - y_min).abs() < f64::EPSILON {
            return Ok(0.0);
        }

        let x_bin_width = (x_max - x_min) / n_bins as f64;
        let y_bin_width = (y_max - y_min) / n_bins as f64;

        let mut joint_hist = vec![vec![0; n_bins]; n_bins];
        let mut x_hist = vec![0; n_bins];
        let mut y_hist = vec![0; n_bins];

        let n_samples = x.len();
        for i in 0..n_samples {
            let x_bin = ((x[i] - x_min) / x_bin_width) as usize;
            let y_bin = ((y[i] - y_min) / y_bin_width) as usize;
            let x_bin = x_bin.min(n_bins - 1);
            let y_bin = y_bin.min(n_bins - 1);

            joint_hist[x_bin][y_bin] += 1;
            x_hist[x_bin] += 1;
            y_hist[y_bin] += 1;
        }

        let n_total = n_samples as f64;
        let mut mi = 0.0;

        for (i, joint_row) in joint_hist.iter().enumerate() {
            for (j, &joint_count) in joint_row.iter().enumerate() {
                if joint_count > 0 && x_hist[i] > 0 && y_hist[j] > 0 {
                    let p_xy = joint_count as f64 / n_total;
                    let p_x = x_hist[i] as f64 / n_total;
                    let p_y = y_hist[j] as f64 / n_total;

                    mi += p_xy * (p_xy / (p_x * p_y)).ln();
                }
            }
        }

        Ok(mi.max(0.0))
    }

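    /// Fraction of samples whose (diagonal) Mahalanobis distance from the mean
    /// exceeds the mean distance plus three standard deviations.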
    fn calculate_outlier_score(&self, data: ArrayView2<f64>) -> Result<f64> {
        let n_samples = data.nrows();
        if n_samples < 3 {
            return Ok(0.0);
        }

        let mean = data.mean_axis(Axis(0)).unwrap();
        let cov_matrix = self.calculate_covariance_matrix(data, &mean)?;

        let distances: Vec<f64> = (0..n_samples)
            .into_par_iter()
            .map(|i| {
                let sample = data.row(i);
                self.mahalanobis_distance(sample, &mean, &cov_matrix)
                    .unwrap_or(0.0)
            })
            .collect();

        let mean_distance = distances.iter().sum::<f64>() / distances.len() as f64;
        let distance_std = {
            let variance = distances
                .iter()
                .map(|&d| (d - mean_distance).powi(2))
                .sum::<f64>()
                / distances.len() as f64;
            variance.sqrt()
        };

        let threshold = mean_distance + 3.0 * distance_std;
        let outlier_count = distances.iter().filter(|&&d| d > threshold).count();

        Ok(outlier_count as f64 / n_samples as f64)
    }

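    /// Sample covariance matrix (denominator `n - 1`), filling both triangles
    /// by symmetry.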
    fn calculate_covariance_matrix(
        &self,
        data: ArrayView2<f64>,
        mean: &Array1<f64>,
    ) -> Result<Array2<f64>> {
        let n_samples = data.nrows();
        let n_features = data.ncols();
        let mut cov_matrix = Array2::zeros((n_features, n_features));

        for i in 0..n_features {
            for j in i..n_features {
                let mut covariance = 0.0;
                for k in 0..n_samples {
                    covariance += (data[[k, i]] - mean[i]) * (data[[k, j]] - mean[j]);
                }
                covariance /= (n_samples - 1) as f64;

                cov_matrix[[i, j]] = covariance;
                if i != j {
                    cov_matrix[[j, i]] = covariance;
                }
            }
        }

        Ok(cov_matrix)
    }

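    /// Simplified Mahalanobis distance that uses only the diagonal of the
    /// covariance matrix (i.e. a variance-scaled Euclidean distance), which
    /// avoids inverting the full matrix.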
    fn mahalanobis_distance(
        &self,
        sample: ArrayView1<f64>,
        mean: &Array1<f64>,
        cov_matrix: &Array2<f64>,
    ) -> Result<f64> {
        let diff = &(sample.to_owned() - mean);

        let mut distance_squared = 0.0;
        for i in 0..diff.len() {
            let variance = cov_matrix[[i, i]];
            if variance > f64::EPSILON {
                distance_squared += diff[i].powi(2) / variance;
            }
        }

        Ok(distance_squared.sqrt())
    }

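    /// Symmetric matrix of pairwise mutual information between features, with
    /// ones on the diagonal.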
    fn calculate_interaction_matrix(&self, data: ArrayView2<f64>) -> Result<Array2<f64>> {
        let n_features = data.ncols();
        let mut interaction_matrix = Array2::zeros((n_features, n_features));

        for i in 0..n_features {
            for j in i..n_features {
                let interaction = if i == j {
                    1.0
                } else {
                    self.calculate_mutual_information(data.column(i), data.column(j))?
                };

                interaction_matrix[[i, j]] = interaction;
                interaction_matrix[[j, i]] = interaction;
            }
        }

        Ok(interaction_matrix)
    }

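    /// Runs the three per-feature normality heuristics in parallel and combines
    /// their means into a weighted overall score (0.4 / 0.3 / 0.3), clamped to
    /// `[0.0, 1.0]`.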
    fn assess_normality(&self, data: ArrayView2<f64>) -> Result<NormalityAssessment> {
        let n_features = data.ncols();

        let shapiro_wilk_scores = Array1::from_vec(
            (0..n_features)
                .into_par_iter()
                .map(|i| self.shapiro_wilk_test(data.column(i)))
                .collect::<Result<Vec<_>>>()?,
        );

        let anderson_darling_scores = Array1::from_vec(
            (0..n_features)
                .into_par_iter()
                .map(|i| self.anderson_darling_test(data.column(i)))
                .collect::<Result<Vec<_>>>()?,
        );

        let jarque_bera_scores = Array1::from_vec(
            (0..n_features)
                .into_par_iter()
                .map(|i| self.jarque_bera_test(data.column(i)))
                .collect::<Result<Vec<_>>>()?,
        );

        let overall_normality = {
            let mean_shapiro = {
                let val = shapiro_wilk_scores.view().mean();
                if val.is_nan() {
                    0.0
                } else {
                    val
                }
            };
            let mean_anderson = {
                let val = anderson_darling_scores.view().mean();
                if val.is_nan() {
                    0.0
                } else {
                    val
                }
            };
            let mean_jarque = {
                let val = jarque_bera_scores.view().mean();
                if val.is_nan() {
                    0.0
                } else {
                    val
                }
            };

            (mean_shapiro * 0.4 + mean_anderson * 0.3 + mean_jarque * 0.3).clamp(0.0, 1.0)
        };

        Ok(NormalityAssessment {
            shapiro_wilk_scores,
            anderson_darling_scores,
            jarque_bera_scores,
            overall_normality,
        })
    }

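    /// Lightweight stand-in for the Shapiro-Wilk test: scores normality from
    /// the sample skewness and excess kurtosis via `exp(-|skew|)` and
    /// `exp(-|kurtosis|)`, averaged. Returns 1.0 for (near-)constant features.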
    fn shapiro_wilk_test(&self, data: ArrayView1<f64>) -> Result<f64> {
        let n = data.len();
        if n < 3 {
            return Ok(0.0);
        }

        let mut sorted_data = data.to_vec();
        sorted_data.sort_by(|a, b| a.total_cmp(b));

        let mean = {
            let val = data.mean();
            if val.is_nan() {
                0.0
            } else {
                val
            }
        };
        let variance = data.var(1.0);

        if variance <= f64::EPSILON {
            return Ok(1.0);
        }

        let std_dev = variance.sqrt();

        let skewness = data
            .iter()
            .map(|&x| ((x - mean) / std_dev).powi(3))
            .sum::<f64>()
            / n as f64;

        let kurtosis = data
            .iter()
            .map(|&x| ((x - mean) / std_dev).powi(4))
            .sum::<f64>()
            / n as f64
            - 3.0;

        let skewness_score = (-skewness.abs()).exp();
        let kurtosis_score = (-kurtosis.abs()).exp();

        Ok((skewness_score + kurtosis_score) / 2.0)
    }

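    /// Placeholder for the Anderson-Darling test: reuses the skew/kurtosis
    /// score above with a mild sample-size adjustment (never below 0.8 of it).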
    fn anderson_darling_test(&self, data: ArrayView1<f64>) -> Result<f64> {
        let shapiro_score = self.shapiro_wilk_test(data)?;

        let n = data.len() as f64;
        let adjustment = (1.0 / (1.0 + n / 100.0)).max(0.8);

        Ok(shapiro_score * adjustment)
    }

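    /// Jarque-Bera statistic `JB = n/6 * (skew^2 + kurtosis^2 / 4)` mapped to a
    /// (0, 1] score via `exp(-JB / 10)`; higher means closer to normal.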
    fn jarque_bera_test(&self, data: ArrayView1<f64>) -> Result<f64> {
        let n = data.len();
        if n < 3 {
            return Ok(0.0);
        }

        let mean = {
            let val = data.mean();
            if val.is_nan() {
                0.0
            } else {
                val
            }
        };
        let variance = data.var(1.0);

        if variance <= f64::EPSILON {
            return Ok(1.0);
        }

        let std_dev = variance.sqrt();

        let skewness = data
            .iter()
            .map(|&x| ((x - mean) / std_dev).powi(3))
            .sum::<f64>()
            / n as f64;

        let kurtosis = data
            .iter()
            .map(|&x| ((x - mean) / std_dev).powi(4))
            .sum::<f64>()
            / n as f64
            - 3.0;

        let jb_stat = (n as f64 / 6.0) * (skewness.powi(2) + kurtosis.powi(2) / 4.0);

        Ok((-jb_stat / 10.0).exp())
    }

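    /// Heuristic ML-readiness score in `[0.0, 1.0]`: a weighted blend of sample
    /// size (0.25), dimensionality (0.15), completeness (0.35) and variance
    /// quality (0.25). Very small datasets get a flat 0.1.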
    fn predict_ml_quality(&self, data: ArrayView2<f64>) -> Result<f64> {
        let n_samples = data.nrows();
        let n_features = data.ncols();

        if n_samples < 10 || n_features == 0 {
            return Ok(0.1);
        }

        let size_factor = (n_samples as f64 / (n_samples as f64 + 100.0)).min(1.0);
        let dimensionality_factor = (n_features as f64 / (n_features as f64 + 50.0)).min(1.0);

        let missing_rate = self.calculate_missing_rate(data);
        let completeness_factor = 1.0 - missing_rate;

        let variance_factor = self.calculate_variance_quality(data)?;

        let quality_score = (size_factor * 0.25
            + dimensionality_factor * 0.15
            + completeness_factor * 0.35
            + variance_factor * 0.25)
            .clamp(0.0, 1.0);

        Ok(quality_score)
    }

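    /// Fraction of entries that are NaN or infinite.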
    fn calculate_missing_rate(&self, data: ArrayView2<f64>) -> f64 {
        let total_elements = data.len();
        let missing_count = data
            .iter()
            .filter(|&&x| x.is_nan() || x.is_infinite())
            .count();

        missing_count as f64 / total_elements as f64
    }

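    /// Scores how evenly variance is spread across features using the
    /// coefficient of variation of the per-feature variances: `1 / (1 + cv)`,
    /// floored at 0.1.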
    fn calculate_variance_quality(&self, data: ArrayView2<f64>) -> Result<f64> {
        let n_features = data.ncols();
        if n_features == 0 {
            return Ok(0.0);
        }

        let variances: Vec<f64> = (0..n_features).map(|i| data.column(i).var(1.0)).collect();
        let mean_variance = variances.iter().sum::<f64>() / n_features as f64;

        if mean_variance <= f64::EPSILON {
            return Ok(0.1);
        }

        let variance_cv = {
            let variance_of_variances = variances
                .iter()
                .map(|&v| (v - mean_variance).powi(2))
                .sum::<f64>()
                / n_features as f64;
            variance_of_variances.sqrt() / mean_variance
        };

        Ok((1.0 / (1.0 + variance_cv)).max(0.1))
    }

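    /// Builds the [`CorrelationInsights`]: Pearson correlations, mutual
    /// information interactions, correlation-based causality hints, and a
    /// per-feature importance score (mean absolute correlation with the other
    /// features).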
    fn analyze_correlations(&self, data: ArrayView2<f64>) -> Result<CorrelationInsights> {
        let n_features = data.ncols();

        let linear_correlations = self.calculate_correlation_matrix(data)?;
        let nonlinear_correlations = self.calculate_interaction_matrix(data)?;
        let causality_hints = self.estimate_causality_matrix(data)?;

        let feature_importance = Array1::from_vec(
            (0..n_features)
                .map(|i| {
                    if n_features < 2 {
                        return 0.0;
                    }
                    let mut total_correlation = 0.0;
                    for j in 0..n_features {
                        if i != j {
                            total_correlation += linear_correlations[[i, j]].abs();
                        }
                    }
                    total_correlation / (n_features - 1) as f64
                })
                .collect(),
        );

        Ok(CorrelationInsights {
            linear_correlations,
            nonlinear_correlations,
            causality_hints,
            feature_importance,
        })
    }

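    /// Symmetric Pearson correlation matrix with ones on the diagonal.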
    fn calculate_correlation_matrix(&self, data: ArrayView2<f64>) -> Result<Array2<f64>> {
        let n_features = data.ncols();
        let mut corr_matrix = Array2::zeros((n_features, n_features));

        for i in 0..n_features {
            for j in i..n_features {
                let correlation = if i == j {
                    1.0
                } else {
                    self.pearson_correlation(data.column(i), data.column(j))?
                };

                corr_matrix[[i, j]] = correlation;
                corr_matrix[[j, i]] = correlation;
            }
        }

        Ok(corr_matrix)
    }

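    /// Pearson correlation coefficient between two equal-length features;
    /// returns 0.0 for mismatched lengths, fewer than two samples, or zero
    /// variance.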
    fn pearson_correlation(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> Result<f64> {
        let n = x.len();
        if n != y.len() || n < 2 {
            return Ok(0.0);
        }

        let mean_x = {
            let val = x.mean();
            if val.is_nan() {
                0.0
            } else {
                val
            }
        };
        let mean_y = {
            let val = y.mean();
            if val.is_nan() {
                0.0
            } else {
                val
            }
        };

        let mut numerator = 0.0;
        let mut sum_sq_x = 0.0;
        let mut sum_sq_y = 0.0;

        for i in 0..n {
            let dx = x[i] - mean_x;
            let dy = y[i] - mean_y;

            numerator += dx * dy;
            sum_sq_x += dx * dx;
            sum_sq_y += dy * dy;
        }

        let denominator = (sum_sq_x * sum_sq_y).sqrt();

        if denominator <= f64::EPSILON {
            Ok(0.0)
        } else {
            Ok(numerator / denominator)
        }
    }

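    /// Rough causality hints: absolute Pearson correlation scaled by 0.5. This
    /// is a correlation-based placeholder, not a directional (e.g. Granger)
    /// causality test.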
    fn estimate_causality_matrix(&self, data: ArrayView2<f64>) -> Result<Array2<f64>> {
        let n_features = data.ncols();
        let mut causality_matrix = Array2::zeros((n_features, n_features));

        for i in 0..n_features {
            for j in 0..n_features {
                if i != j {
                    let correlation = self.pearson_correlation(data.column(i), data.column(j))?;
                    causality_matrix[[i, j]] = correlation.abs() * 0.5;
                }
            }
        }

        Ok(causality_matrix)
    }
}

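/// Convenience wrapper: analyzes `dataset` with a default
/// [`AdvancedDatasetAnalyzer`].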
#[allow(dead_code)]
pub fn analyze_dataset_advanced(dataset: &Dataset) -> Result<AdvancedQualityMetrics> {
    let analyzer = AdvancedDatasetAnalyzer::new();
    analyzer.analyze_dataset_quality(dataset)
}

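/// Convenience wrapper that runs the analysis with advanced precision disabled
/// and returns only the ML quality score.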
#[allow(dead_code)]
pub fn quick_quality_assessment(dataset: &Dataset) -> Result<f64> {
    let analyzer = AdvancedDatasetAnalyzer::new().with_advanced_precision(false);
    let metrics = analyzer.analyze_dataset_quality(dataset)?;
    Ok(metrics.ml_quality_score)
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array2;

    #[allow(dead_code)]
    fn create_test_dataset() -> Dataset {
        let data = Array2::from_shape_vec((100, 3), (0..300).map(|x| x as f64).collect()).unwrap();
        let target = Array1::from_vec((0..100).map(|x| (x % 2) as f64).collect());
        Dataset::new(data, Some(target))
    }

    #[test]
    fn test_advanced_analyzer_creation() {
        let analyzer = AdvancedDatasetAnalyzer::new();
        assert!(analyzer.use_gpu);
        assert!(analyzer.advanced_precision);
    }

    #[test]
    fn test_quick_quality_assessment() {
        let dataset = create_test_dataset();
        let quality = quick_quality_assessment(&dataset);
        assert!(quality.is_ok());
        let quality_score = quality.unwrap();
        assert!((0.0..=1.0).contains(&quality_score));
    }
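
    // A sketch of an additional test (not part of the original source):
    // exercises the full `analyze_dataset_advanced` path on the same synthetic
    // dataset and checks that the bounded scores stay in range.
    #[test]
    fn test_analyze_dataset_advanced_ranges() {
        let dataset = create_test_dataset();
        let metrics = analyze_dataset_advanced(&dataset).expect("analysis should succeed");
        assert!((0.0..=1.0).contains(&metrics.ml_quality_score));
        assert!((0.0..=1.0).contains(&metrics.normality_assessment.overall_normality));
        assert!((0.0..=1.0).contains(&metrics.outlier_score));
        assert_eq!(metrics.interaction_matrix.nrows(), 3);
    }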
}