use crate::error::{StatsError, StatsResult};
use scirs2_core::ndarray::{Array1, Array2, ArrayView2, Axis, ScalarOperand};
use scirs2_core::numeric::{Float, FromPrimitive, One, Zero};
use scirs2_core::{simd_ops::SimdUnifiedOps, validation::*};
use statrs::statistics::Statistics;
use std::marker::PhantomData;
/// Principal component analysis estimator supporting several fitting algorithms.
///
/// Create with [`EnhancedPCA::new`], then call `fit` / `fit_transform`.
pub struct EnhancedPCA<F> {
    /// Algorithm used to compute the decomposition.
    pub algorithm: PCAAlgorithm,
    /// Preprocessing and solver configuration.
    pub config: PCAConfig,
    /// Fitted model state; `None` until `fit` has succeeded.
    pub results: Option<PCAResult<F>>,
    // Ties the element type `F` to the struct without storing a value of it.
    _phantom: PhantomData<F>,
}
/// Strategy used to compute the principal components.
///
/// NOTE(review): in the implementation visible in this file, every variant
/// other than `SVD` and `Eigen` currently falls back to the exact SVD solver
/// and its extra parameters are accepted but unused.
#[derive(Debug, Clone, PartialEq)]
pub enum PCAAlgorithm {
    /// Exact singular value decomposition of the preprocessed data matrix.
    SVD,
    /// Eigendecomposition of the sample covariance matrix.
    Eigen,
    /// Randomized projection SVD (currently delegates to exact SVD).
    Randomized {
        n_iter: usize,
        n_oversamples: usize,
    },
    /// Mini-batch incremental fitting (currently delegates to exact SVD).
    Incremental {
        batchsize: usize,
    },
    /// Sparse PCA (currently delegates to exact SVD).
    Sparse {
        alpha: f64,
        max_iter: usize,
    },
    /// Robust PCA (currently delegates to exact SVD).
    Robust {
        lambda: f64,
        max_iter: usize,
    },
}
/// Preprocessing and solver options shared by all PCA algorithms.
#[derive(Debug, Clone)]
pub struct PCAConfig {
    /// Number of components to keep; `None` keeps `min(n_samples, n_features)`.
    pub n_components: Option<usize>,
    /// Subtract the per-feature mean before fitting.
    pub center: bool,
    /// Divide each feature by its standard deviation before fitting.
    pub scale: bool,
    /// Convergence tolerance.
    /// NOTE(review): not referenced by the fitting code visible in this file — confirm intended use.
    pub tolerance: f64,
    /// RNG seed for randomized algorithms.
    /// NOTE(review): not referenced by the fitting code visible in this file.
    pub seed: Option<u64>,
    /// Enable parallel execution where supported.
    /// NOTE(review): not referenced by the fitting code visible in this file.
    pub parallel: bool,
}
impl Default for PCAConfig {
fn default() -> Self {
Self {
n_components: None,
center: true,
scale: false,
tolerance: 1e-6,
seed: None,
parallel: true,
}
}
}
/// Output of a fitted PCA model.
#[derive(Debug, Clone)]
pub struct PCAResult<F> {
    /// Principal axes, one component per row (`n_components x n_features`).
    pub components: Array2<F>,
    /// Variance explained by each retained component.
    pub explained_variance: Array1<F>,
    /// `explained_variance` as a fraction of `total_variance`.
    pub explained_variance_ratio: Array1<F>,
    /// Running sum of `explained_variance_ratio`.
    pub cumulative_variance_ratio: Array1<F>,
    /// Singular values associated with the retained components.
    pub singular_values: Array1<F>,
    /// Per-feature mean used for centering (zeros when centering is disabled).
    pub mean: Array1<F>,
    /// Per-feature standard deviations used for scaling, when enabled.
    pub scale: Option<Array1<F>>,
    /// Total variance of the preprocessed data.
    pub total_variance: F,
    /// Number of components actually retained.
    pub n_components: usize,
    /// Algorithm that produced this result.
    pub algorithm: PCAAlgorithm,
}
impl<F> EnhancedPCA<F>
where
    F: Float
        + Zero
        + One
        + Copy
        + Send
        + Sync
        + SimdUnifiedOps
        + FromPrimitive
        + std::fmt::Display
        + std::iter::Sum
        + ScalarOperand,
{
    /// Creates a new, unfitted estimator.
    pub fn new(algorithm: PCAAlgorithm, config: PCAConfig) -> Self {
        Self {
            algorithm,
            config,
            results: None,
            _phantom: PhantomData,
        }
    }

    /// Fits the model to `data` (rows are samples, columns are features) and
    /// returns a reference to the stored results.
    ///
    /// # Errors
    /// * data contains non-finite values, is empty, or has fewer than two
    ///   samples (all algorithms divide by `n_samples - 1`, which would be
    ///   zero and silently produce NaN/inf results);
    /// * `config.n_components` exceeds `min(n_samples, n_features)`;
    /// * the underlying decomposition fails.
    pub fn fit(&mut self, data: &ArrayView2<F>) -> StatsResult<&PCAResult<F>> {
        checkarray_finite(data, "data")?;
        let (n_samples, n_features) = data.dim();
        if n_samples == 0 || n_features == 0 {
            return Err(StatsError::InvalidArgument(
                "Data cannot be empty".to_string(),
            ));
        }
        // Guard the (n_samples - 1) variance denominator used by every solver.
        if n_samples < 2 {
            return Err(StatsError::InvalidArgument(
                "PCA requires at least 2 samples".to_string(),
            ));
        }
        let n_components = self
            .config
            .n_components
            .unwrap_or_else(|| n_features.min(n_samples));
        if n_components > n_features.min(n_samples) {
            return Err(StatsError::InvalidArgument(format!(
                "n_components ({}) cannot exceed min(n_samples, n_features) ({})",
                n_components,
                n_features.min(n_samples)
            )));
        }
        let (preprocesseddata, mean, scale) = self.preprocessdata(data)?;
        let results = match &self.algorithm {
            PCAAlgorithm::SVD => self.fit_svd(&preprocesseddata, n_components, mean, scale)?,
            PCAAlgorithm::Eigen => self.fit_eigen(&preprocesseddata, n_components, mean, scale)?,
            PCAAlgorithm::Randomized {
                n_iter,
                n_oversamples,
            } => self.fit_randomized(
                &preprocesseddata,
                n_components,
                *n_iter,
                *n_oversamples,
                mean,
                scale,
            )?,
            PCAAlgorithm::Incremental { batchsize } => {
                self.fit_incremental(&preprocesseddata, n_components, *batchsize, mean, scale)?
            }
            PCAAlgorithm::Sparse { alpha, max_iter } => self.fit_sparse(
                &preprocesseddata,
                n_components,
                *alpha,
                *max_iter,
                mean,
                scale,
            )?,
            PCAAlgorithm::Robust { lambda, max_iter } => self.fit_robust(
                &preprocesseddata,
                n_components,
                *lambda,
                *max_iter,
                mean,
                scale,
            )?,
        };
        // `Option::insert` stores the value and returns a reference to it,
        // replacing the previous unwrap-after-assignment pattern.
        Ok(self.results.insert(results))
    }

    /// Applies the configured centering and scaling.
    ///
    /// Returns the preprocessed data, the per-feature mean that was
    /// subtracted (zeros when centering is off), and the per-feature standard
    /// deviations used for scaling (`None` when scaling is off).
    fn preprocessdata(
        &self,
        data: &ArrayView2<F>,
    ) -> StatsResult<(Array2<F>, Array1<F>, Option<Array1<F>>)> {
        let mut processeddata = data.to_owned();
        let n_features = data.ncols();
        let mean = if self.config.center {
            let mean = data
                .mean_axis(Axis(0))
                .expect("n_samples > 0 was validated in fit");
            for mut row in processeddata.rows_mut() {
                for (i, &m) in mean.iter().enumerate() {
                    row[i] = row[i] - m;
                }
            }
            mean
        } else {
            Array1::zeros(n_features)
        };
        let scale = if self.config.scale {
            let mut std_dev = Array1::zeros(n_features);
            for (j, mut col) in processeddata.columns_mut().into_iter().enumerate() {
                // Take deviations about the column mean so the standard
                // deviation is correct even when centering is disabled.
                // (The previous code used sqrt(mean(x^2)), which is only a
                // standard deviation for already-centered data.) Population
                // (1/n) normalization, as before.
                let col_mean = col.mean().expect("column is non-empty");
                let var = col
                    .mapv(|x| (x - col_mean) * (x - col_mean))
                    .mean()
                    .expect("column is non-empty");
                std_dev[j] = var.sqrt();
                // Leave (near-)constant features untouched to avoid dividing by ~0.
                if std_dev[j] > F::from(1e-12).expect("1e-12 is representable") {
                    for x in col.iter_mut() {
                        *x = *x / std_dev[j];
                    }
                }
            }
            Some(std_dev)
        } else {
            None
        };
        Ok((processeddata, mean, scale))
    }

    /// Exact PCA via singular value decomposition of the preprocessed data.
    fn fit_svd(
        &self,
        data: &Array2<F>,
        n_components: usize,
        mean: Array1<F>,
        scale: Option<Array1<F>>,
    ) -> StatsResult<PCAResult<F>> {
        let n_samples = data.nrows();
        // The backend decomposition runs in f64 regardless of F.
        let data_f64 = data.mapv(|x| x.to_f64().expect("finite value converts to f64"));
        let (_u, s, vt) = scirs2_linalg::svd(&data_f64.view(), true, None)
            .map_err(|e| StatsError::ComputationError(format!("SVD failed: {}", e)))?;
        let singular_values = s.slice(scirs2_core::ndarray::s![..n_components]).to_owned();
        // Rows of V^T are the principal axes.
        let components = vt
            .slice(scirs2_core::ndarray::s![..n_components, ..])
            .to_owned();
        // Covariance eigenvalues are sigma_i^2 / (n - 1); n >= 2 was validated in fit.
        let denom = (n_samples - 1) as f64;
        let total_variance_f64 = s.mapv(|x| x * x).sum() / denom;
        let explained_variance_f64 = singular_values.mapv(|x| x * x / denom);
        let explained_variance_ratio_f64 = &explained_variance_f64 / total_variance_f64;
        // Running sum of the per-component ratios.
        let mut cumulative_variance_ratio_f64 = Array1::zeros(n_components);
        let mut cumsum = 0.0;
        for i in 0..n_components {
            cumsum += explained_variance_ratio_f64[i];
            cumulative_variance_ratio_f64[i] = cumsum;
        }
        Ok(PCAResult {
            components: components.mapv(|x| F::from(x).expect("f64 converts to F")),
            explained_variance: explained_variance_f64
                .mapv(|x| F::from(x).expect("f64 converts to F")),
            explained_variance_ratio: explained_variance_ratio_f64
                .mapv(|x| F::from(x).expect("f64 converts to F")),
            cumulative_variance_ratio: cumulative_variance_ratio_f64
                .mapv(|x| F::from(x).expect("f64 converts to F")),
            singular_values: singular_values.mapv(|x| F::from(x).expect("f64 converts to F")),
            mean,
            scale,
            total_variance: F::from(total_variance_f64).expect("f64 converts to F"),
            n_components,
            algorithm: self.algorithm.clone(),
        })
    }

    /// PCA via eigendecomposition of the sample covariance matrix.
    fn fit_eigen(
        &self,
        data: &Array2<F>,
        n_components: usize,
        mean: Array1<F>,
        scale: Option<Array1<F>>,
    ) -> StatsResult<PCAResult<F>> {
        let n_samples = data.nrows();
        let data_f64 = data.mapv(|x| x.to_f64().expect("finite value converts to f64"));
        // n >= 2 was validated in fit, so the denominator is nonzero.
        let denom = (n_samples - 1) as f64;
        let cov_matrix = data_f64.t().dot(&data_f64) / denom;
        let (eigenvalues, eigenvectors) =
            scirs2_linalg::eigh(&cov_matrix.view(), None).map_err(|e| {
                StatsError::ComputationError(format!("Eigendecomposition failed: {}", e))
            })?;
        // Pair each eigenvalue with its eigenvector and sort descending.
        let mut eigen_pairs: Vec<(f64, scirs2_core::ndarray::ArrayView1<f64>)> = eigenvalues
            .iter()
            .zip(eigenvectors.columns())
            .map(|(&val, vec)| (val, vec))
            .collect();
        eigen_pairs.sort_by(|a, b| b.0.partial_cmp(&a.0).expect("eigenvalues are finite"));
        let selected_eigenvalues: Vec<f64> = eigen_pairs[..n_components]
            .iter()
            .map(|(val, _)| *val)
            .collect();
        let mut selected_eigenvectors = Array2::zeros((data.ncols(), n_components));
        for (i, (_, eigenvec)) in eigen_pairs[..n_components].iter().enumerate() {
            selected_eigenvectors.column_mut(i).assign(eigenvec);
        }
        // Components are stored row-wise, matching the SVD path.
        let components = selected_eigenvectors.t().to_owned();
        let total_variance_f64 = eigenvalues.sum();
        let explained_variance_f64 = Array1::from_vec(selected_eigenvalues);
        let explained_variance_ratio_f64 = &explained_variance_f64 / total_variance_f64;
        let mut cumulative_variance_ratio_f64 = Array1::zeros(n_components);
        let mut cumsum = 0.0;
        for i in 0..n_components {
            cumsum += explained_variance_ratio_f64[i];
            cumulative_variance_ratio_f64[i] = cumsum;
        }
        Ok(PCAResult {
            components: components.mapv(|x| F::from(x).expect("f64 converts to F")),
            explained_variance: explained_variance_f64
                .mapv(|x| F::from(x).expect("f64 converts to F")),
            explained_variance_ratio: explained_variance_ratio_f64
                .mapv(|x| F::from(x).expect("f64 converts to F")),
            cumulative_variance_ratio: cumulative_variance_ratio_f64
                .mapv(|x| F::from(x).expect("f64 converts to F")),
            // Recover sigma_i = sqrt(lambda_i * (n - 1)) so this path reports
            // the same singular values as the SVD path (which stores sigma_i
            // directly); the previous sqrt(lambda_i) was off by sqrt(n - 1).
            // Tiny negative eigenvalues from numerical noise in the PSD
            // covariance are clamped so sqrt cannot produce NaN.
            singular_values: explained_variance_f64
                .mapv(|x| F::from((x.max(0.0) * denom).sqrt()).expect("f64 converts to F")),
            mean,
            scale,
            total_variance: F::from(total_variance_f64).expect("f64 converts to F"),
            n_components,
            algorithm: self.algorithm.clone(),
        })
    }

    /// Randomized PCA.
    /// NOTE(review): currently falls back to the exact SVD solver;
    /// `n_iter` and `n_oversamples` are accepted but unused.
    fn fit_randomized(
        &self,
        data: &Array2<F>,
        n_components: usize,
        _n_iter: usize,
        _oversamples: usize,
        mean: Array1<F>,
        scale: Option<Array1<F>>,
    ) -> StatsResult<PCAResult<F>> {
        self.fit_svd(data, n_components, mean, scale)
    }

    /// Incremental PCA.
    /// NOTE(review): currently falls back to the exact SVD solver; the batch
    /// size is accepted but unused.
    fn fit_incremental(
        &self,
        data: &Array2<F>,
        n_components: usize,
        _batchsize: usize,
        mean: Array1<F>,
        scale: Option<Array1<F>>,
    ) -> StatsResult<PCAResult<F>> {
        self.fit_svd(data, n_components, mean, scale)
    }

    /// Sparse PCA.
    /// NOTE(review): currently falls back to the exact SVD solver; the
    /// penalty parameters are accepted but unused.
    fn fit_sparse(
        &self,
        data: &Array2<F>,
        n_components: usize,
        _alpha: f64,
        _max_iter: usize,
        mean: Array1<F>,
        scale: Option<Array1<F>>,
    ) -> StatsResult<PCAResult<F>> {
        self.fit_svd(data, n_components, mean, scale)
    }

    /// Robust PCA.
    /// NOTE(review): currently falls back to the exact SVD solver; the
    /// robustness parameters are accepted but unused.
    fn fit_robust(
        &self,
        data: &Array2<F>,
        n_components: usize,
        _lambda: f64,
        _max_iter: usize,
        mean: Array1<F>,
        scale: Option<Array1<F>>,
    ) -> StatsResult<PCAResult<F>> {
        self.fit_svd(data, n_components, mean, scale)
    }

    /// Projects `data` onto the fitted principal components.
    ///
    /// # Errors
    /// Fails when the model has not been fitted, the data contains
    /// non-finite values, or its column count differs from the fitted
    /// feature count.
    pub fn transform(&self, data: &ArrayView2<F>) -> StatsResult<Array2<F>> {
        let results = self.results.as_ref().ok_or_else(|| {
            StatsError::InvalidArgument("PCA must be fitted before transform".to_string())
        })?;
        checkarray_finite(data, "data")?;
        if data.ncols() != results.mean.len() {
            return Err(StatsError::DimensionMismatch(format!(
                "Data columns ({}) must match fitted features ({})",
                data.ncols(),
                results.mean.len()
            )));
        }
        // Re-apply the preprocessing that was used during fitting.
        let mut processeddata = data.to_owned();
        if self.config.center {
            for mut row in processeddata.rows_mut() {
                for (i, &m) in results.mean.iter().enumerate() {
                    row[i] = row[i] - m;
                }
            }
        }
        if let Some(ref scale) = results.scale {
            for (j, mut col) in processeddata.columns_mut().into_iter().enumerate() {
                // Features that were (near-)constant at fit time were not
                // scaled, so skip them here as well.
                if scale[j] > F::from(1e-12).expect("1e-12 is representable") {
                    for x in col.iter_mut() {
                        *x = *x / scale[j];
                    }
                }
            }
        }
        Ok(processeddata.dot(&results.components.t()))
    }

    /// Convenience: `fit` followed by `transform` on the same data.
    pub fn fit_transform(&mut self, data: &ArrayView2<F>) -> StatsResult<Array2<F>> {
        self.fit(data)?;
        self.transform(data)
    }

    /// Maps component-space data back to the original feature space,
    /// undoing the scaling and centering applied during fitting.
    ///
    /// # Errors
    /// Fails when the model has not been fitted, the data contains
    /// non-finite values, or its column count differs from `n_components`.
    pub fn inverse_transform(&self, transformeddata: &ArrayView2<F>) -> StatsResult<Array2<F>> {
        let results = self.results.as_ref().ok_or_else(|| {
            StatsError::InvalidArgument("PCA must be fitted before inverse_transform".to_string())
        })?;
        checkarray_finite(transformeddata, "transformeddata")?;
        if transformeddata.ncols() != results.n_components {
            return Err(StatsError::DimensionMismatch(format!(
                "Transformed data columns ({}) must match n_components ({})",
                transformeddata.ncols(),
                results.n_components
            )));
        }
        let mut reconstructed = transformeddata.dot(&results.components);
        // Undo preprocessing in reverse order: un-scale, then un-center.
        if let Some(ref scale) = results.scale {
            for (j, mut col) in reconstructed.columns_mut().into_iter().enumerate() {
                if scale[j] > F::from(1e-12).expect("1e-12 is representable") {
                    for x in col.iter_mut() {
                        *x = *x * scale[j];
                    }
                }
            }
        }
        if self.config.center {
            for mut row in reconstructed.rows_mut() {
                for (i, &m) in results.mean.iter().enumerate() {
                    row[i] = row[i] + m;
                }
            }
        }
        Ok(reconstructed)
    }

    /// Fraction of total variance explained per component, if fitted.
    pub fn explained_variance_ratio(&self) -> Option<&Array1<F>> {
        self.results.as_ref().map(|r| &r.explained_variance_ratio)
    }

    /// Cumulative explained-variance ratios, if fitted.
    pub fn cumulative_variance_ratio(&self) -> Option<&Array1<F>> {
        self.results.as_ref().map(|r| &r.cumulative_variance_ratio)
    }

    /// Principal axes (one component per row), if fitted.
    pub fn components(&self) -> Option<&Array2<F>> {
        self.results.as_ref().map(|r| &r.components)
    }
}
/// Factor-analysis estimator extracting latent factors from the correlation matrix.
pub struct EnhancedFactorAnalysis<F> {
    /// Number of latent factors to extract.
    pub n_factors: usize,
    /// Estimation and rotation configuration.
    pub config: FactorAnalysisConfig,
    /// Fitted model state; `None` until `fit` has succeeded.
    pub results: Option<FactorAnalysisResult<F>>,
    // Ties the element type `F` to the struct without storing a value of it.
    _phantom: PhantomData<F>,
}
/// Options controlling factor extraction and rotation.
#[derive(Debug, Clone)]
pub struct FactorAnalysisConfig {
    /// Iteration cap for iterative estimation.
    /// NOTE(review): not referenced by the fitting code visible in this file.
    pub max_iter: usize,
    /// Convergence tolerance.
    /// NOTE(review): not referenced by the fitting code visible in this file.
    pub tolerance: f64,
    /// Rotation applied to the initial loadings.
    pub rotation: RotationMethod,
    /// RNG seed.
    /// NOTE(review): not referenced by the fitting code visible in this file.
    pub seed: Option<u64>,
}
/// Factor-rotation schemes.
///
/// NOTE(review): the rotation routines visible in this file are identity
/// stubs, so at present every variant leaves the loadings unchanged.
#[derive(Debug, Clone, PartialEq)]
pub enum RotationMethod {
    /// Keep the unrotated solution.
    None,
    /// Varimax rotation.
    Varimax,
    /// Quartimax rotation.
    Quartimax,
    /// Promax rotation.
    Promax,
}
/// Output of a fitted factor-analysis model.
#[derive(Debug, Clone)]
pub struct FactorAnalysisResult<F> {
    /// Factor loadings (`n_features x n_factors`).
    pub loadings: Array2<F>,
    /// Per-variable variance not explained by the common factors.
    pub uniquenesses: Array1<F>,
    /// Per-sample factor scores, when computed.
    pub scores: Option<Array2<F>>,
    /// Per-variable sum of squared loadings.
    pub communalities: Array1<F>,
    /// Variance accounted for by each factor (column sums of squared loadings).
    pub explained_variance: Array1<F>,
    /// Model log-likelihood, when computed.
    pub log_likelihood: Option<F>,
}
impl Default for FactorAnalysisConfig {
fn default() -> Self {
Self {
max_iter: 1000,
tolerance: 1e-6,
rotation: RotationMethod::Varimax,
seed: None,
}
}
}
impl<F> EnhancedFactorAnalysis<F>
where
    F: Float
        + Zero
        + One
        + Copy
        + Send
        + Sync
        + SimdUnifiedOps
        + FromPrimitive
        + std::fmt::Display
        + std::iter::Sum
        + ScalarOperand,
{
    /// Creates an unfitted factor-analysis model.
    ///
    /// # Errors
    /// Fails when `n_factors` is zero.
    pub fn new(n_factors: usize, config: FactorAnalysisConfig) -> StatsResult<Self> {
        check_positive(n_factors, "n_factors")?;
        Ok(Self {
            n_factors,
            config,
            results: None,
            _phantom: PhantomData,
        })
    }

    /// Fits the factor model to `data` (rows are samples, columns are
    /// features): standardize, form the correlation matrix, extract the top
    /// eigenpairs as loadings, then apply the configured rotation.
    ///
    /// # Errors
    /// Fails on non-finite or empty data, or when `n_factors >= n_features`.
    pub fn fit(&mut self, data: &ArrayView2<F>) -> StatsResult<&FactorAnalysisResult<F>> {
        checkarray_finite(data, "data")?;
        let (n_samples, n_features) = data.dim();
        // Guard before standardizedata, which unwraps per-column means and
        // would panic on zero rows.
        if n_samples == 0 || n_features == 0 {
            return Err(StatsError::InvalidArgument(
                "Data cannot be empty".to_string(),
            ));
        }
        if self.n_factors >= n_features {
            return Err(StatsError::InvalidArgument(format!(
                "n_factors ({}) must be less than n_features ({})",
                self.n_factors, n_features
            )));
        }
        let standardizeddata = self.standardizedata(data)?;
        let corr_matrix = self.compute_correlation_matrix(&standardizeddata)?;
        let mut loadings = self.initial_loadings(&corr_matrix)?;
        if self.config.rotation != RotationMethod::None {
            loadings = self.apply_rotation(loadings)?;
        }
        // Communality of a variable = sum of its squared loadings.
        let communalities = loadings
            .rows()
            .into_iter()
            .map(|row| row.mapv(|x| x * x).sum())
            .collect::<Array1<F>>();
        // Uniqueness is the variance not shared with the factors:
        // 1 - communality, clamped at zero. The previous implementation left
        // this at a hard-coded 0.5 placeholder.
        let uniquenesses = communalities.mapv(|c| (F::one() - c).max(F::zero()));
        // Variance explained by each factor = column sum of squared loadings.
        let explained_variance = loadings
            .columns()
            .into_iter()
            .map(|col| col.mapv(|x| x * x).sum())
            .collect::<Array1<F>>();
        let results = FactorAnalysisResult {
            loadings,
            uniquenesses,
            scores: None, // factor scores are not computed here
            communalities,
            explained_variance,
            log_likelihood: None, // no likelihood objective is evaluated here
        };
        // Store and hand back a reference in one step.
        Ok(self.results.insert(results))
    }

    /// Centers each column and divides by its population (1/n) standard
    /// deviation. Near-constant columns (std <= 1e-12) are left untouched.
    fn standardizedata(&self, data: &ArrayView2<F>) -> StatsResult<Array2<F>> {
        let mut standardized = data.to_owned();
        for mut col in standardized.columns_mut() {
            let mean = col.mean().expect("non-empty column (validated in fit)");
            let std = col
                .mapv(|x| (x - mean) * (x - mean))
                .mean()
                .expect("non-empty column (validated in fit)")
                .sqrt();
            if std > F::from(1e-12).expect("1e-12 is representable") {
                col.mapv_inplace(|x| (x - mean) / std);
            }
        }
        Ok(standardized)
    }

    /// Builds the correlation matrix of already-standardized data.
    fn compute_correlation_matrix(&self, data: &Array2<F>) -> StatsResult<Array2<F>> {
        let n_features = data.ncols();
        let mut corr_matrix = Array2::zeros((n_features, n_features));
        for i in 0..n_features {
            for j in i..n_features {
                let col_i = data.column(i);
                let col_j = data.column(j);
                let corr = if i == j {
                    F::one()
                } else {
                    let numerator = col_i
                        .iter()
                        .zip(col_j.iter())
                        .map(|(&x, &y)| x * y)
                        .sum::<F>();
                    let n = F::from(col_i.len()).expect("usize converts to F");
                    // The columns were standardized with a population (1/n)
                    // standard deviation, so the matching Pearson correlation
                    // is sum(x*y) / n. The previous (n - 1) denominator
                    // overstated every off-diagonal entry by n / (n - 1).
                    numerator / n
                };
                // Fill both triangles of the symmetric matrix.
                corr_matrix[[i, j]] = corr;
                corr_matrix[[j, i]] = corr;
            }
        }
        Ok(corr_matrix)
    }

    /// Initial (unrotated) loadings from the top `n_factors` eigenpairs of
    /// the correlation matrix: loading[j][i] = eigvec_i[j] * sqrt(eigval_i).
    fn initial_loadings(&self, corr_matrix: &Array2<F>) -> StatsResult<Array2<F>> {
        let corr_f64 = corr_matrix.mapv(|x| x.to_f64().expect("finite value converts to f64"));
        let (eigenvalues, eigenvectors) =
            scirs2_linalg::eigh(&corr_f64.view(), None).map_err(|e| {
                StatsError::ComputationError(format!("Eigendecomposition failed: {}", e))
            })?;
        // Pair eigenvalues with their eigenvectors and sort descending.
        let mut eigen_pairs: Vec<(f64, scirs2_core::ndarray::ArrayView1<f64>)> = eigenvalues
            .iter()
            .zip(eigenvectors.columns())
            .map(|(&val, vec)| (val, vec))
            .collect();
        eigen_pairs.sort_by(|a, b| b.0.partial_cmp(&a.0).expect("eigenvalues are finite"));
        let n_features = corr_matrix.nrows();
        let mut loadings = Array2::zeros((n_features, self.n_factors));
        for (i, (eigenval, eigenvec)) in eigen_pairs[..self.n_factors].iter().enumerate() {
            // Clamp numerical-noise negatives so sqrt cannot produce NaN.
            let sqrt_eigenval = eigenval.max(0.0).sqrt();
            for j in 0..n_features {
                loadings[[j, i]] =
                    F::from(eigenvec[j] * sqrt_eigenval).expect("f64 converts to F");
            }
        }
        Ok(loadings)
    }

    /// Dispatches to the configured rotation routine.
    fn apply_rotation(&self, loadings: Array2<F>) -> StatsResult<Array2<F>> {
        match self.config.rotation {
            RotationMethod::Varimax => self.varimax_rotation(loadings),
            RotationMethod::Quartimax => self.quartimax_rotation(loadings),
            RotationMethod::Promax => self.promax_rotation(loadings),
            RotationMethod::None => Ok(loadings),
        }
    }

    /// Varimax rotation.
    /// NOTE(review): identity stub — returns the loadings unchanged.
    fn varimax_rotation(&self, loadings: Array2<F>) -> StatsResult<Array2<F>> {
        Ok(loadings)
    }

    /// Quartimax rotation.
    /// NOTE(review): identity stub — returns the loadings unchanged.
    fn quartimax_rotation(&self, loadings: Array2<F>) -> StatsResult<Array2<F>> {
        Ok(loadings)
    }

    /// Promax rotation.
    /// NOTE(review): identity stub — returns the loadings unchanged.
    fn promax_rotation(&self, loadings: Array2<F>) -> StatsResult<Array2<F>> {
        Ok(loadings)
    }

    /// Fitted factor loadings, if available.
    pub fn loadings(&self) -> Option<&Array2<F>> {
        self.results.as_ref().map(|r| &r.loadings)
    }

    /// Fitted communalities, if available.
    pub fn communalities(&self) -> Option<&Array1<F>> {
        self.results.as_ref().map(|r| &r.communalities)
    }
}
#[allow(dead_code)]
pub fn enhanced_pca<F>(
data: &ArrayView2<F>,
n_components: Option<usize>,
algorithm: Option<PCAAlgorithm>,
) -> StatsResult<PCAResult<F>>
where
F: Float
+ Zero
+ One
+ Copy
+ Send
+ Sync
+ SimdUnifiedOps
+ FromPrimitive
+ std::fmt::Display
+ std::iter::Sum
+ ScalarOperand,
{
let algorithm = algorithm.unwrap_or(PCAAlgorithm::SVD);
let config = PCAConfig {
n_components,
..Default::default()
};
let mut pca = EnhancedPCA::new(algorithm, config);
Ok(pca.fit(data)?.clone())
}
#[allow(dead_code)]
pub fn enhanced_factor_analysis<F>(
data: &ArrayView2<F>,
n_factors: usize,
rotation: Option<RotationMethod>,
) -> StatsResult<FactorAnalysisResult<F>>
where
F: Float
+ Zero
+ One
+ Copy
+ Send
+ Sync
+ SimdUnifiedOps
+ FromPrimitive
+ std::fmt::Display
+ std::iter::Sum
+ ScalarOperand,
{
let config = FactorAnalysisConfig {
rotation: rotation.unwrap_or(RotationMethod::Varimax),
..Default::default()
};
let mut fa = EnhancedFactorAnalysis::new(n_factors, config)?;
Ok(fa.fit(data)?.clone())
}