use crate::Dataset;
use tenflowers_core::{Result, TensorError};
pub struct AdvancedStatistics;
impl AdvancedStatistics {
pub fn compute_multivariate_stats<T, D>(dataset: &D) -> Result<MultivariateStatistics<T>>
where
T: Clone
+ Default
+ scirs2_core::numeric::Float
+ Send
+ Sync
+ bytemuck::Pod
+ bytemuck::Zeroable
+ 'static,
D: Dataset<T>,
{
let sample_count = dataset.len();
if sample_count == 0 {
return Err(TensorError::InvalidShape {
operation: "AdvancedStatistics::compute_multivariate_stats".to_string(),
reason: "Empty dataset".to_string(),
shape: Some(vec![0]),
context: None,
});
}
let (first_features, _) = dataset.get(0)?;
let feature_dims = first_features.shape().dims();
let n_features = if feature_dims.is_empty() {
1
} else {
feature_dims.iter().product()
};
let mut all_features = Vec::with_capacity(sample_count * n_features);
for i in 0..sample_count {
let (features, _) = dataset.get(i)?;
let feature_vec = features.to_vec()?;
all_features.extend_from_slice(&feature_vec);
}
let covariance_matrix =
Self::compute_covariance_matrix(&all_features, n_features, sample_count)?;
let eigenvalues = Self::compute_eigenvalues(&covariance_matrix)?;
let skewness = Self::compute_skewness(&all_features, n_features, sample_count)?;
let kurtosis = Self::compute_kurtosis(&all_features, n_features, sample_count)?;
Ok(MultivariateStatistics {
covariance_matrix,
eigenvalues,
skewness,
kurtosis,
n_features,
n_samples: sample_count,
})
}
fn compute_covariance_matrix<T>(
data: &[T],
n_features: usize,
n_samples: usize,
) -> Result<Vec<Vec<T>>>
where
T: Clone + Default + scirs2_core::numeric::Float,
{
let mut means = vec![T::zero(); n_features];
for sample_idx in 0..n_samples {
for feat_idx in 0..n_features {
let data_idx = sample_idx * n_features + feat_idx;
means[feat_idx] = means[feat_idx] + data[data_idx].clone();
}
}
for mean in &mut means {
*mean =
mean.clone() / T::from(n_samples).expect("sample count should convert to float");
}
let mut cov_matrix = vec![vec![T::zero(); n_features]; n_features];
for i in 0..n_features {
for j in 0..n_features {
let mut covariance = T::zero();
for sample_idx in 0..n_samples {
let i_idx = sample_idx * n_features + i;
let j_idx = sample_idx * n_features + j;
let dev_i = data[i_idx].clone() - means[i].clone();
let dev_j = data[j_idx].clone() - means[j].clone();
covariance = covariance + dev_i * dev_j;
}
cov_matrix[i][j] = covariance
/ T::from(n_samples - 1).expect("sample count should convert to float");
}
}
Ok(cov_matrix)
}
fn compute_eigenvalues<T>(matrix: &[Vec<T>]) -> Result<Vec<T>>
where
T: Clone + Default + scirs2_core::numeric::Float,
{
let n = matrix.len();
let mut eigenvalues = Vec::new();
for i in 0..n {
eigenvalues.push(matrix[i][i].clone());
}
Ok(eigenvalues)
}
fn compute_skewness<T>(data: &[T], n_features: usize, n_samples: usize) -> Result<Vec<T>>
where
T: Clone + Default + scirs2_core::numeric::Float,
{
let mut skewness_values = vec![T::zero(); n_features];
for feat_idx in 0..n_features {
let mut feature_values = Vec::with_capacity(n_samples);
for sample_idx in 0..n_samples {
let data_idx = sample_idx * n_features + feat_idx;
feature_values.push(data[data_idx].clone());
}
let mean = feature_values.iter().fold(T::zero(), |acc, &x| acc + x)
/ T::from(n_samples).expect("sample count should convert to float");
let variance = feature_values
.iter()
.map(|&x| {
let dev = x - mean.clone();
dev.clone() * dev
})
.fold(T::zero(), |acc, x| acc + x)
/ T::from(n_samples).expect("sample count should convert to float");
let std_dev = variance.sqrt();
if std_dev > T::zero() {
let skew = feature_values
.iter()
.map(|&x| {
let normalized = (x - mean.clone()) / std_dev.clone();
normalized.clone() * normalized.clone() * normalized
})
.fold(T::zero(), |acc, x| acc + x)
/ T::from(n_samples).expect("sample count should convert to float");
skewness_values[feat_idx] = skew;
}
}
Ok(skewness_values)
}
fn compute_kurtosis<T>(data: &[T], n_features: usize, n_samples: usize) -> Result<Vec<T>>
where
T: Clone + Default + scirs2_core::numeric::Float,
{
let mut kurtosis_values = vec![T::zero(); n_features];
for feat_idx in 0..n_features {
let mut feature_values = Vec::with_capacity(n_samples);
for sample_idx in 0..n_samples {
let data_idx = sample_idx * n_features + feat_idx;
feature_values.push(data[data_idx].clone());
}
let mean = feature_values.iter().fold(T::zero(), |acc, &x| acc + x)
/ T::from(n_samples).expect("sample count should convert to float");
let variance = feature_values
.iter()
.map(|&x| {
let dev = x - mean.clone();
dev.clone() * dev
})
.fold(T::zero(), |acc, x| acc + x)
/ T::from(n_samples).expect("sample count should convert to float");
let std_dev = variance.sqrt();
if std_dev > T::zero() {
let kurt = feature_values
.iter()
.map(|&x| {
let normalized = (x - mean.clone()) / std_dev.clone();
let norm_squared = normalized.clone() * normalized.clone();
norm_squared.clone() * norm_squared
})
.fold(T::zero(), |acc, x| acc + x)
/ T::from(n_samples).expect("sample count should convert to float");
kurtosis_values[feat_idx] =
kurt - T::from(3.0).expect("constant 3.0 should convert to float");
}
}
Ok(kurtosis_values)
}
pub fn compute_pca<T, D>(dataset: &D, n_components: usize) -> Result<PCAResult<T>>
where
T: Clone
+ Default
+ scirs2_core::numeric::Float
+ Send
+ Sync
+ bytemuck::Pod
+ bytemuck::Zeroable
+ 'static,
D: Dataset<T>,
{
let sample_count = dataset.len();
if sample_count == 0 {
return Err(TensorError::InvalidShape {
operation: "AdvancedStatistics::compute_pca".to_string(),
reason: "Empty dataset".to_string(),
shape: Some(vec![0]),
context: None,
});
}
let multivariate_stats: MultivariateStatistics<T> =
Self::compute_multivariate_stats(dataset)?;
let eigenvalues = multivariate_stats.eigenvalues;
let n_features = multivariate_stats.n_features;
let selected_components = std::cmp::min(n_components, eigenvalues.len());
let mut sorted_eigenvalues = eigenvalues.clone();
sorted_eigenvalues.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
let principal_components = sorted_eigenvalues[..selected_components].to_vec();
let explained_variance_ratio = Self::compute_explained_variance(&principal_components)?;
Ok(PCAResult {
principal_components,
explained_variance_ratio,
n_components: selected_components,
n_features,
})
}
fn compute_explained_variance<T>(eigenvalues: &[T]) -> Result<Vec<T>>
where
T: Clone + Default + scirs2_core::numeric::Float,
{
let total_variance = eigenvalues.iter().fold(T::zero(), |acc, &x| acc + x);
if total_variance <= T::zero() {
return Ok(vec![T::zero(); eigenvalues.len()]);
}
let ratios = eigenvalues
.iter()
.map(|&x| x / total_variance.clone())
.collect();
Ok(ratios)
}
}
#[derive(Debug, Clone)]
pub struct MultivariateStatistics<T> {
pub covariance_matrix: Vec<Vec<T>>,
pub eigenvalues: Vec<T>,
pub skewness: Vec<T>,
pub kurtosis: Vec<T>,
pub n_features: usize,
pub n_samples: usize,
}
#[derive(Debug, Clone)]
pub struct PCAResult<T> {
pub principal_components: Vec<T>,
pub explained_variance_ratio: Vec<T>,
pub n_components: usize,
pub n_features: usize,
}
pub trait AdvancedStatisticsExt<T> {
fn compute_multivariate_statistics(&self) -> Result<MultivariateStatistics<T>>;
fn compute_pca(&self, n_components: usize) -> Result<PCAResult<T>>;
}
impl<T, D> AdvancedStatisticsExt<T> for D
where
T: Clone
+ Default
+ scirs2_core::numeric::Float
+ Send
+ Sync
+ bytemuck::Pod
+ bytemuck::Zeroable
+ 'static,
D: Dataset<T>,
{
fn compute_multivariate_statistics(&self) -> Result<MultivariateStatistics<T>> {
AdvancedStatistics::compute_multivariate_stats(self)
}
fn compute_pca(&self, n_components: usize) -> Result<PCAResult<T>> {
AdvancedStatistics::compute_pca(self, n_components)
}
}