use crate::error::{DatasetsError, Result};
use crate::utils::Dataset;
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2, Axis};
use scirs2_core::parallel_ops::*;
use statrs::statistics::Statistics;
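/// Quality metrics computed for a dataset: distributional complexity,
/// entropy, outlier prevalence, pairwise feature interactions, normality
/// proxies, a heuristic ML-readiness score, and correlation summaries.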
#[derive(Debug, Clone)]
pub struct AdvancedQualityMetrics {
pub complexity_score: f64,
pub entropy: f64,
pub outlier_score: f64,
pub interaction_matrix: Array2<f64>,
pub normality_assessment: NormalityAssessment,
pub ml_quality_score: f64,
pub correlation_insights: CorrelationInsights,
}
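/// Per-feature normality scores in `[0, 1]` (higher means closer to normal)
/// plus a weighted overall score. The individual tests are fast heuristic
/// approximations rather than exact statistical tests.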
#[derive(Debug, Clone)]
pub struct NormalityAssessment {
pub shapiro_wilk_scores: Array1<f64>,
pub anderson_darling_scores: Array1<f64>,
pub jarque_bera_scores: Array1<f64>,
pub overall_normality: f64,
}
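/// Correlation-based summaries: linear (Pearson) and nonlinear (mutual
/// information) matrices, symmetric correlation-derived "causality hints",
/// and a per-feature importance score from mean absolute correlation.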
#[derive(Debug, Clone)]
pub struct CorrelationInsights {
pub linear_correlations: Array2<f64>,
pub nonlinear_correlations: Array2<f64>,
pub causality_hints: Array2<f64>,
pub feature_importance: Array1<f64>,
}
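/// Configurable analyzer behind the advanced quality metrics. The fields are
/// builder-style toggles; defaults enable GPU use and high-precision paths
/// with a 0.01 significance threshold.
///
/// A minimal usage sketch (import paths depend on your crate layout; adjust
/// as needed):
///
/// ```ignore
/// let analyzer = AdvancedDatasetAnalyzer::new()
///     .with_gpu(false)
///     .with_significance_threshold(0.05);
/// let metrics = analyzer.analyze_dataset_quality(&dataset)?;
/// println!("ML quality: {:.3}", metrics.ml_quality_score);
/// ```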
pub struct AdvancedDatasetAnalyzer {
use_gpu: bool,
advanced_precision: bool,
significance_threshold: f64,
}
impl Default for AdvancedDatasetAnalyzer {
fn default() -> Self {
Self {
use_gpu: true,
advanced_precision: true,
significance_threshold: 0.01,
}
}
}
impl AdvancedDatasetAnalyzer {
pub fn new() -> Self {
Self::default()
}
pub fn with_gpu(mut self, use_gpu: bool) -> Self {
self.use_gpu = use_gpu;
self
}
pub fn with_advanced_precision(mut self, advanced_precision: bool) -> Self {
self.advanced_precision = advanced_precision;
self
}
pub fn with_significance_threshold(mut self, threshold: f64) -> Self {
self.significance_threshold = threshold;
self
}
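    /// Runs the full quality-analysis pipeline over `dataset.data`.
    ///
    /// Returns a [`DatasetsError::ValidationError`] for datasets with fewer
    /// than 3 samples or no features.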
pub fn analyze_dataset_quality(&self, dataset: &Dataset) -> Result<AdvancedQualityMetrics> {
let data = &dataset.data;
let n_samples = data.nrows();
let n_features = data.ncols();
if n_samples < 3 || n_features == 0 {
return Err(DatasetsError::ValidationError(
"Dataset too small for advanced analysis".to_string(),
));
}
let complexity_score = self.calculate_complexity_score(data.view())?;
let entropy = self.calculate_dataset_entropy(data.view())?;
let outlier_score = self.calculate_outlier_score(data.view())?;
let interaction_matrix = self.calculate_interaction_matrix(data.view())?;
let normality_assessment = self.assess_normality(data.view())?;
let ml_quality_score = self.predict_ml_quality(data.view())?;
let correlation_insights = self.analyze_correlations(data.view())?;
Ok(AdvancedQualityMetrics {
complexity_score,
entropy,
outlier_score,
interaction_matrix,
normality_assessment,
ml_quality_score,
correlation_insights,
})
}
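    /// Geometric mean of the per-feature complexity (normalized histogram
    /// entropy) scores, computed over features in parallel.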
fn calculate_complexity_score(&self, data: ArrayView2<f64>) -> Result<f64> {
let n_features = data.ncols();
let complexity_scores = (0..n_features)
.into_par_iter()
.map(|i| {
let feature = data.column(i);
self.calculate_feature_complexity(feature)
})
.collect::<Result<Vec<_>>>()?;
        // Geometric mean taken in log space: multiplying many sub-1.0 scores
        // directly can underflow to 0.0 on wide datasets.
        let log_sum: f64 = complexity_scores
            .iter()
            .map(|&s| s.max(f64::EPSILON).ln())
            .sum();
        Ok((log_sum / n_features as f64).exp())
}
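    /// Normalized histogram entropy of a single feature in `[0, 1]`;
    /// constant features score 0.0.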
fn calculate_feature_complexity(&self, feature: ArrayView1<f64>) -> Result<f64> {
let mut values = feature.to_vec();
        values.sort_by(|a, b| a.total_cmp(b));
let n_bins = ((values.len() as f64).sqrt() as usize).clamp(10, 100);
let min_val = values[0];
let max_val = values[values.len() - 1];
if (max_val - min_val).abs() < f64::EPSILON {
            return Ok(0.0);
        }
let bin_width = (max_val - min_val) / n_bins as f64;
let mut histogram = vec![0; n_bins];
for &value in &values {
let bin_idx = ((value - min_val) / bin_width) as usize;
let bin_idx = bin_idx.min(n_bins - 1);
histogram[bin_idx] += 1;
}
let n_total = values.len() as f64;
let entropy = histogram
.iter()
.filter(|&&count| count > 0)
.map(|&count| {
let p = count as f64 / n_total;
-p * p.ln()
})
.sum::<f64>();
let max_entropy = (n_bins as f64).ln();
Ok(entropy / max_entropy)
}
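    /// Total per-feature entropy minus an average mutual-information
    /// correction for redundant features, floored at zero.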
fn calculate_dataset_entropy(&self, data: ArrayView2<f64>) -> Result<f64> {
let n_features = data.ncols();
let feature_entropies: Vec<f64> = (0..n_features)
.into_par_iter()
.map(|i| {
let feature = data.column(i);
self.calculate_feature_complexity(feature).unwrap_or(0.0)
})
.collect();
let mean_entropy = feature_entropies.iter().sum::<f64>() / n_features as f64;
let mutual_info_correction = self.estimate_mutual_information(data)?;
Ok((mean_entropy * n_features as f64 - mutual_info_correction).max(0.0))
}
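    /// Average mutual information over a strided subsample of feature pairs.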
fn estimate_mutual_information(&self, data: ArrayView2<f64>) -> Result<f64> {
let n_features = data.ncols();
if n_features < 2 {
return Ok(0.0);
}
        // Subsample feature pairs with a stride so the cost stays bounded on
        // wide datasets; the stride is a coarse heuristic, not an exact cap.
        let max_pairs = 100;
        let step = ((n_features * (n_features - 1) / 2) / max_pairs).max(1);
let mut total_mi = 0.0;
let mut pair_count = 0;
for i in (0..n_features).step_by(step) {
for j in (i + 1..n_features).step_by(step) {
let mi = self.calculate_mutual_information(data.column(i), data.column(j))?;
total_mi += mi;
pair_count += 1;
}
}
Ok(if pair_count > 0 {
total_mi / pair_count as f64
} else {
0.0
})
}
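    /// Histogram-based mutual information estimate (20 bins per axis).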
fn calculate_mutual_information(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> Result<f64> {
let n_bins = 20;
let x_min = x.iter().fold(f64::INFINITY, |a, &b| a.min(b));
let x_max = x.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
let y_min = y.iter().fold(f64::INFINITY, |a, &b| a.min(b));
let y_max = y.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
if (x_max - x_min).abs() < f64::EPSILON || (y_max - y_min).abs() < f64::EPSILON {
return Ok(0.0);
}
let x_bin_width = (x_max - x_min) / n_bins as f64;
let y_bin_width = (y_max - y_min) / n_bins as f64;
let mut joint_hist = vec![vec![0; n_bins]; n_bins];
let mut x_hist = vec![0; n_bins];
let mut y_hist = vec![0; n_bins];
let n_samples = x.len();
for i in 0..n_samples {
let x_bin = ((x[i] - x_min) / x_bin_width) as usize;
let y_bin = ((y[i] - y_min) / y_bin_width) as usize;
let x_bin = x_bin.min(n_bins - 1);
let y_bin = y_bin.min(n_bins - 1);
joint_hist[x_bin][y_bin] += 1;
x_hist[x_bin] += 1;
y_hist[y_bin] += 1;
}
let n_total = n_samples as f64;
let mut mi = 0.0;
for i in 0..n_bins {
for (j, _) in y_hist.iter().enumerate().take(n_bins) {
if joint_hist[i][j] > 0 && x_hist[i] > 0 && y_hist[j] > 0 {
let p_xy = joint_hist[i][j] as f64 / n_total;
let p_x = x_hist[i] as f64 / n_total;
let p_y = y_hist[j] as f64 / n_total;
mi += p_xy * (p_xy / (p_x * p_y)).ln();
}
}
}
Ok(mi.max(0.0))
}
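    /// Fraction of samples whose (diagonal-approximated) Mahalanobis distance
    /// exceeds the mean distance plus three standard deviations.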
fn calculate_outlier_score(&self, data: ArrayView2<f64>) -> Result<f64> {
let n_samples = data.nrows();
if n_samples < 3 {
return Ok(0.0);
}
        let mean = data.mean_axis(Axis(0)).expect("dataset has at least one row");
let cov_matrix = self.calculate_covariance_matrix(data, &mean)?;
let distances: Vec<f64> = (0..n_samples)
.into_par_iter()
.map(|i| {
let sample = data.row(i);
self.mahalanobis_distance(sample, &mean, &cov_matrix)
.unwrap_or(0.0)
})
.collect();
let mean_distance = distances.iter().sum::<f64>() / distances.len() as f64;
let distance_std = {
let variance = distances
.iter()
.map(|&d| (d - mean_distance).powi(2))
.sum::<f64>()
/ distances.len() as f64;
variance.sqrt()
};
let threshold = mean_distance + 3.0 * distance_std;
let outlier_count = distances.iter().filter(|&&d| d > threshold).count();
Ok(outlier_count as f64 / n_samples as f64)
}
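    /// Sample covariance matrix with denominator `n - 1`.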
fn calculate_covariance_matrix(
&self,
data: ArrayView2<f64>,
mean: &Array1<f64>,
) -> Result<Array2<f64>> {
let n_samples = data.nrows();
let n_features = data.ncols();
let mut cov_matrix = Array2::zeros((n_features, n_features));
for i in 0..n_features {
for j in i..n_features {
let mut covariance = 0.0;
for k in 0..n_samples {
covariance += (data[[k, i]] - mean[i]) * (data[[k, j]] - mean[j]);
}
covariance /= (n_samples - 1) as f64;
cov_matrix[[i, j]] = covariance;
if i != j {
cov_matrix[[j, i]] = covariance;
}
}
}
Ok(cov_matrix)
}
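    /// Diagonal approximation of the Mahalanobis distance: each deviation is
    /// scaled by its feature's variance alone, which avoids inverting the
    /// covariance matrix but ignores cross-feature correlations.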
fn mahalanobis_distance(
&self,
sample: ArrayView1<f64>,
mean: &Array1<f64>,
cov_matrix: &Array2<f64>,
) -> Result<f64> {
let diff = &(sample.to_owned() - mean);
let mut distance_squared = 0.0;
for i in 0..diff.len() {
let variance = cov_matrix[[i, i]];
if variance > f64::EPSILON {
distance_squared += diff[i].powi(2) / variance;
}
}
Ok(distance_squared.sqrt())
}
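    /// Symmetric matrix of pairwise mutual information with 1.0 on the
    /// diagonal.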
fn calculate_interaction_matrix(&self, data: ArrayView2<f64>) -> Result<Array2<f64>> {
let n_features = data.ncols();
let mut interaction_matrix = Array2::zeros((n_features, n_features));
for i in 0..n_features {
for j in i..n_features {
let interaction = if i == j {
                    1.0
                } else {
self.calculate_mutual_information(data.column(i), data.column(j))?
};
interaction_matrix[[i, j]] = interaction;
interaction_matrix[[j, i]] = interaction;
}
}
Ok(interaction_matrix)
}
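    /// Runs the three normality proxies per feature and blends their means
    /// with weights 0.4 / 0.3 / 0.3.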
fn assess_normality(&self, data: ArrayView2<f64>) -> Result<NormalityAssessment> {
let n_features = data.ncols();
let shapiro_wilk_scores = Array1::from_vec(
(0..n_features)
.into_par_iter()
.map(|i| self.shapiro_wilk_test(data.column(i)))
.collect::<Result<Vec<_>>>()?,
);
let anderson_darling_scores = Array1::from_vec(
(0..n_features)
.into_par_iter()
.map(|i| self.anderson_darling_test(data.column(i)))
.collect::<Result<Vec<_>>>()?,
);
let jarque_bera_scores = Array1::from_vec(
(0..n_features)
.into_par_iter()
.map(|i| self.jarque_bera_test(data.column(i)))
.collect::<Result<Vec<_>>>()?,
);
        let overall_normality = {
            let nan_safe = |v: f64| if v.is_nan() { 0.0 } else { v };
            let mean_shapiro = nan_safe(shapiro_wilk_scores.view().mean());
            let mean_anderson = nan_safe(anderson_darling_scores.view().mean());
            let mean_jarque = nan_safe(jarque_bera_scores.view().mean());
            // Weighted blend; weights favor the skewness/kurtosis proxy.
            (mean_shapiro * 0.4 + mean_anderson * 0.3 + mean_jarque * 0.3).clamp(0.0, 1.0)
        };
Ok(NormalityAssessment {
shapiro_wilk_scores,
anderson_darling_scores,
jarque_bera_scores,
overall_normality,
})
}
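    /// Moment-based normality proxy, not the exact Shapiro-Wilk statistic:
    /// the score decays exponentially with |skewness| and |excess kurtosis|.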
fn shapiro_wilk_test(&self, data: ArrayView1<f64>) -> Result<f64> {
let n = data.len();
if n < 3 {
return Ok(0.0);
}
        let mean = data.mean();
        let mean = if mean.is_nan() { 0.0 } else { mean };
let variance = data.var(1.0);
if variance <= f64::EPSILON {
            return Ok(1.0);
        }
let std_dev = variance.sqrt();
let skewness = data
.iter()
.map(|&x| ((x - mean) / std_dev).powi(3))
.sum::<f64>()
/ n as f64;
let kurtosis = data
.iter()
.map(|&x| ((x - mean) / std_dev).powi(4))
.sum::<f64>()
/ n as f64
- 3.0;
let skewness_score = (-skewness.abs()).exp();
let kurtosis_score = (-kurtosis.abs()).exp();
Ok((skewness_score + kurtosis_score) / 2.0)
}
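    /// Heuristic stand-in for Anderson-Darling: the Shapiro-Wilk proxy with
    /// a sample-size damping factor in `[0.8, 1.0]`.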
fn anderson_darling_test(&self, data: ArrayView1<f64>) -> Result<f64> {
let shapiro_score = self.shapiro_wilk_test(data)?;
let n = data.len() as f64;
let adjustment = (1.0 / (1.0 + n / 100.0)).max(0.8);
Ok(shapiro_score * adjustment)
}
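    /// Jarque-Bera statistic mapped into `(0, 1]` via `exp(-JB / 10)`.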
fn jarque_bera_test(&self, data: ArrayView1<f64>) -> Result<f64> {
let n = data.len();
if n < 3 {
return Ok(0.0);
}
        let mean = data.mean();
        let mean = if mean.is_nan() { 0.0 } else { mean };
let variance = data.var(1.0);
if variance <= f64::EPSILON {
return Ok(1.0);
}
let std_dev = variance.sqrt();
let skewness = data
.iter()
.map(|&x| ((x - mean) / std_dev).powi(3))
.sum::<f64>()
/ n as f64;
let kurtosis = data
.iter()
.map(|&x| ((x - mean) / std_dev).powi(4))
.sum::<f64>()
/ n as f64
- 3.0;
let jb_stat = (n as f64 / 6.0) * (skewness.powi(2) + kurtosis.powi(2) / 4.0);
Ok((-jb_stat / 10.0).exp())
}
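    /// Heuristic ML-readiness score: a weighted blend of sample-size,
    /// dimensionality, completeness, and variance-homogeneity factors.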
fn predict_ml_quality(&self, data: ArrayView2<f64>) -> Result<f64> {
let n_samples = data.nrows();
let n_features = data.ncols();
if n_samples < 10 || n_features == 0 {
            return Ok(0.1);
        }
let size_factor = (n_samples as f64 / (n_samples as f64 + 100.0)).min(1.0);
let dimensionality_factor = (n_features as f64 / (n_features as f64 + 50.0)).min(1.0);
let missing_rate = self.calculate_missing_rate(data);
let completeness_factor = 1.0 - missing_rate;
let variance_factor = self.calculate_variance_quality(data)?;
let quality_score = (size_factor * 0.25
+ dimensionality_factor * 0.15
+ completeness_factor * 0.35
+ variance_factor * 0.25)
.clamp(0.0, 1.0);
Ok(quality_score)
}
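    /// Fraction of entries that are NaN or infinite.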
fn calculate_missing_rate(&self, data: ArrayView2<f64>) -> f64 {
let total_elements = data.len();
let missing_count = data
.iter()
.filter(|&&x| x.is_nan() || x.is_infinite())
.count();
missing_count as f64 / total_elements as f64
}
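    /// Quality factor from the coefficient of variation of per-feature
    /// variances; more homogeneous variances score higher.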
fn calculate_variance_quality(&self, data: ArrayView2<f64>) -> Result<f64> {
let n_features = data.ncols();
if n_features == 0 {
return Ok(0.0);
}
let variances: Vec<f64> = (0..n_features).map(|i| data.column(i).var(1.0)).collect();
let mean_variance = variances.iter().sum::<f64>() / n_features as f64;
if mean_variance <= f64::EPSILON {
            return Ok(0.1);
        }
let variance_cv = {
let variance_of_variances = variances
.iter()
.map(|&v| (v - mean_variance).powi(2))
.sum::<f64>()
/ n_features as f64;
variance_of_variances.sqrt() / mean_variance
};
Ok((1.0 / (1.0 + variance_cv)).max(0.1))
}
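    /// Builds the linear and nonlinear correlation matrices, causality
    /// hints, and a mean-absolute-correlation feature-importance vector.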
fn analyze_correlations(&self, data: ArrayView2<f64>) -> Result<CorrelationInsights> {
let n_features = data.ncols();
let linear_correlations = self.calculate_correlation_matrix(data)?;
let nonlinear_correlations = self.calculate_interaction_matrix(data)?;
let causality_hints = self.estimate_causality_matrix(data)?;
let feature_importance = Array1::from_vec(
(0..n_features)
.map(|i| {
let mut total_correlation = 0.0;
for j in 0..n_features {
if i != j {
total_correlation += linear_correlations[[i, j]].abs();
}
}
                // Mean absolute correlation; guard the divisor so a
                // single-feature dataset yields 0.0 instead of NaN.
                total_correlation / (n_features - 1).max(1) as f64
})
.collect(),
);
Ok(CorrelationInsights {
linear_correlations,
nonlinear_correlations,
causality_hints,
feature_importance,
})
}
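    /// Symmetric Pearson correlation matrix with a unit diagonal.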
fn calculate_correlation_matrix(&self, data: ArrayView2<f64>) -> Result<Array2<f64>> {
let n_features = data.ncols();
let mut corr_matrix = Array2::zeros((n_features, n_features));
for i in 0..n_features {
for j in i..n_features {
let correlation = if i == j {
1.0
} else {
self.pearson_correlation(data.column(i), data.column(j))?
};
corr_matrix[[i, j]] = correlation;
corr_matrix[[j, i]] = correlation;
}
}
Ok(corr_matrix)
}
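    /// Pearson correlation coefficient; returns 0.0 for degenerate input
    /// (mismatched lengths, fewer than two samples, or zero variance).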
fn pearson_correlation(&self, x: ArrayView1<f64>, y: ArrayView1<f64>) -> Result<f64> {
let n = x.len();
if n != y.len() || n < 2 {
return Ok(0.0);
}
        let mean_x = x.mean();
        let mean_x = if mean_x.is_nan() { 0.0 } else { mean_x };
        let mean_y = y.mean();
        let mean_y = if mean_y.is_nan() { 0.0 } else { mean_y };
let mut numerator = 0.0;
let mut sum_sq_x = 0.0;
let mut sum_sq_y = 0.0;
for i in 0..n {
let dx = x[i] - mean_x;
let dy = y[i] - mean_y;
numerator += dx * dy;
sum_sq_x += dx * dx;
sum_sq_y += dy * dy;
}
let denominator = (sum_sq_x * sum_sq_y).sqrt();
if denominator <= f64::EPSILON {
Ok(0.0)
} else {
Ok(numerator / denominator)
}
}
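    /// Placeholder "causality" scores: symmetric, scaled absolute Pearson
    /// correlations. They carry no directional information and should be
    /// read as association hints only.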
fn estimate_causality_matrix(&self, data: ArrayView2<f64>) -> Result<Array2<f64>> {
let n_features = data.ncols();
let mut causality_matrix = Array2::zeros((n_features, n_features));
for i in 0..n_features {
for j in 0..n_features {
if i != j {
let correlation = self.pearson_correlation(data.column(i), data.column(j))?;
                    causality_matrix[[i, j]] = correlation.abs() * 0.5;
                }
}
}
Ok(causality_matrix)
}
}
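/// Convenience wrapper: runs [`AdvancedDatasetAnalyzer`] with default
/// settings over the given dataset.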
#[allow(dead_code)]
pub fn analyze_dataset_advanced(dataset: &Dataset) -> Result<AdvancedQualityMetrics> {
let analyzer = AdvancedDatasetAnalyzer::new();
analyzer.analyze_dataset_quality(dataset)
}
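/// Returns only the heuristic ML quality score, with the high-precision
/// paths disabled for speed.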
#[allow(dead_code)]
pub fn quick_quality_assessment(dataset: &Dataset) -> Result<f64> {
let analyzer = AdvancedDatasetAnalyzer::new().with_advanced_precision(false);
let metrics = analyzer.analyze_dataset_quality(dataset)?;
Ok(metrics.ml_quality_score)
}
#[cfg(test)]
mod tests {
use super::*;
use scirs2_core::ndarray::Array2;
fn create_test_dataset() -> Dataset {
        let data = Array2::from_shape_vec((100, 3), (0..300).map(|x| x as f64).collect())
            .expect("shape (100, 3) matches the 300-element vector");
let target = Array1::from_vec((0..100).map(|x| (x % 2) as f64).collect());
Dataset::new(data, Some(target))
}
#[test]
fn test_advanced_analyzer_creation() {
let analyzer = AdvancedDatasetAnalyzer::new();
assert!(analyzer.use_gpu);
assert!(analyzer.advanced_precision);
}
#[test]
fn test_quick_quality_assessment() {
let dataset = create_test_dataset();
let quality = quick_quality_assessment(&dataset);
assert!(quality.is_ok());
        let quality_score = quality.expect("assessment should succeed");
assert!((0.0..=1.0).contains(&quality_score));
}
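    // A minimal end-to-end sketch of the full analysis path. The bounds
    // asserted here follow from the metric definitions above: every score is
    // normalized into [0, 1] and the matrices are feature-by-feature.
    #[test]
    fn test_analyze_dataset_advanced_end_to_end() {
        let dataset = create_test_dataset();
        let metrics = analyze_dataset_advanced(&dataset).expect("analysis should succeed");
        assert!((0.0..=1.0).contains(&metrics.complexity_score));
        assert!(metrics.entropy >= 0.0);
        assert!((0.0..=1.0).contains(&metrics.outlier_score));
        assert!((0.0..=1.0).contains(&metrics.ml_quality_score));
        assert!((0.0..=1.0).contains(&metrics.normality_assessment.overall_normality));
        assert_eq!(metrics.interaction_matrix.dim(), (3, 3));
        assert_eq!(metrics.correlation_insights.feature_importance.len(), 3);
    }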
}