use crate::error::{StatsError, StatsResult as Result};
use scirs2_core::ndarray::{Array1, Array2, ArrayView2, Axis};
use scirs2_core::random::{rngs::StdRng, SeedableRng};
use scirs2_core::validation::*;
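/// Configuration for maximum-likelihood factor analysis fitted with an EM
/// algorithm, with optional varimax or promax rotation of the loadings.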
#[derive(Debug, Clone)]
pub struct FactorAnalysis {
pub n_factors: usize,
pub max_iter: usize,
pub tol: f64,
pub rotation: RotationType,
pub random_state: Option<u64>,
}
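/// Rotation applied to the factor loadings after the EM fit.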
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum RotationType {
None,
Varimax,
Promax,
}
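/// Result of [`FactorAnalysis::fit`]: rotated loadings, specific (noise) variances,
/// factor scores for the training data, the column means used for centering, and
/// summary statistics (log-likelihood, iteration count, explained-variance ratios,
/// communalities).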
#[derive(Debug, Clone)]
pub struct FactorAnalysisResult {
pub loadings: Array2<f64>,
pub noise_variance: Array1<f64>,
pub scores: Array2<f64>,
pub mean: Array1<f64>,
pub log_likelihood: f64,
pub n_iter: usize,
pub explained_variance_ratio: Array1<f64>,
pub communalities: Array1<f64>,
}
impl Default for FactorAnalysis {
fn default() -> Self {
Self {
n_factors: 2,
max_iter: 1000,
tol: 1e-6,
rotation: RotationType::Varimax,
random_state: None,
}
}
}
impl FactorAnalysis {
pub fn new(n_factors: usize) -> Result<Self> {
check_positive(n_factors, "n_factors")?;
Ok(Self {
n_factors,
..Default::default()
})
}
pub fn with_max_iter(mut self, max_iter: usize) -> Self {
self.max_iter = max_iter;
self
}
pub fn with_tolerance(mut self, tol: f64) -> Self {
self.tol = tol;
self
}
pub fn with_rotation(mut self, rotation: RotationType) -> Self {
self.rotation = rotation;
self
}
pub fn with_random_state(mut self, seed: u64) -> Self {
self.random_state = Some(seed);
self
}
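    /// Fits the factor model by maximum likelihood using an EM algorithm.
    ///
    /// The data are mean-centered, the loadings are initialized from an SVD, and
    /// E- and M-steps alternate until the change in log-likelihood drops below
    /// `tol` (an error is returned if `max_iter` is reached without converging).
    /// The converged loadings are then rotated according to `rotation`, and the
    /// factor scores, explained-variance ratios, and communalities are computed
    /// from the rotated solution.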
pub fn fit(&self, data: ArrayView2<f64>) -> Result<FactorAnalysisResult> {
checkarray_finite(&data, "data")?;
let (n_samples, n_features) = data.dim();
if n_samples < 2 {
return Err(StatsError::InvalidArgument(
"n_samples must be at least 2".to_string(),
));
}
if self.n_factors >= n_features {
return Err(StatsError::InvalidArgument(format!(
"n_factors ({}) must be less than n_features ({})",
self.n_factors, n_features
)));
}
let mean = data.mean_axis(Axis(0)).expect("Operation failed");
let mut centereddata = data.to_owned();
for mut row in centereddata.rows_mut() {
row -= &mean;
}
        let (mut loadings, mut psi) = self.initialize_parameters(&centereddata)?;
let mut prev_log_likelihood = f64::NEG_INFINITY;
let mut n_iter = 0;
        let mut converged = false;
        for iteration in 0..self.max_iter {
            let (e_h, e_hht) = self.e_step(&centereddata, &loadings, &psi)?;
            let (new_loadings, new_psi) = self.m_step(&centereddata, &e_h, &e_hht)?;
            let log_likelihood =
                self.compute_log_likelihood(&centereddata, &new_loadings, &new_psi)?;
            loadings = new_loadings;
            psi = new_psi;
            n_iter = iteration + 1;
            // Converged when the absolute change in log-likelihood falls below the tolerance.
            if (log_likelihood - prev_log_likelihood).abs() < self.tol {
                converged = true;
                break;
            }
            prev_log_likelihood = log_likelihood;
        }
        if !converged {
            return Err(StatsError::ConvergenceError(format!(
                "EM algorithm failed to converge after {} iterations",
                self.max_iter
            )));
        }
let rotated_loadings = match self.rotation {
RotationType::None => loadings,
RotationType::Varimax => self.varimax_rotation(&loadings)?,
RotationType::Promax => self.promax_rotation(&loadings)?,
};
        let scores = self.compute_factor_scores(&centereddata, &rotated_loadings, &psi)?;
let explained_variance_ratio = self.compute_explained_variance(&rotated_loadings);
let communalities = self.compute_communalities(&rotated_loadings);
let final_log_likelihood =
            self.compute_log_likelihood(&centereddata, &rotated_loadings, &psi)?;
Ok(FactorAnalysisResult {
loadings: rotated_loadings,
noise_variance: psi,
scores,
mean,
log_likelihood: final_log_likelihood,
n_iter,
explained_variance_ratio,
communalities,
})
}
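    /// Initializes the loadings from the leading right singular vectors of the
    /// centered data, scaled by singular value / sqrt(n), and the specific
    /// variances as (1 - communality) floored at 0.01; the latter implicitly
    /// treats each variable's variance as roughly one, so it is best suited to
    /// approximately standardized input.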
fn initialize_parameters(&self, data: &Array2<f64>) -> Result<(Array2<f64>, Array1<f64>)> {
let (n_samples, n_features) = data.dim();
let (_u, s, vt) = scirs2_linalg::svd(&data.view(), false, None).map_err(|e| {
StatsError::ComputationError(format!("SVD initialization failed: {}", e))
})?;
let v = vt.t().to_owned();
let mut loadings = Array2::zeros((n_features, self.n_factors));
for i in 0..self.n_factors {
let scale = (s[i] / (n_samples as f64).sqrt()).max(1e-6);
for j in 0..n_features {
loadings[[j, i]] = v[[j, i]] * scale;
}
}
let mut psi = Array1::ones(n_features);
for i in 0..n_features {
let communality = loadings.row(i).dot(&loadings.row(i));
            psi[i] = (1.0 - communality).max(0.01);
        }
Ok((loadings, psi))
}
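    /// E-step: with M = I + Λᵀ Ψ⁻¹ Λ, each sample's factor posterior is
    /// E[h | x] = M⁻¹ Λᵀ Ψ⁻¹ x with shared covariance Cov(h | x) = M⁻¹.
    /// Returns the posterior means as the rows of `e_h` and `e_hht = M⁻¹`.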
fn e_step(
&self,
data: &Array2<f64>,
loadings: &Array2<f64>,
psi: &Array1<f64>,
) -> Result<(Array2<f64>, Array2<f64>)> {
let (n_samples, n_features) = data.dim();
let mut psi_inv = Array2::zeros((n_features, n_features));
for i in 0..n_features {
if psi[i] <= 0.0 {
return Err(StatsError::ComputationError(
"Specific variances must be positive".to_string(),
));
}
psi_inv[[i, i]] = 1.0 / psi[i];
}
let lt_psi_inv = loadings.t().dot(&psi_inv);
let m = Array2::eye(self.n_factors) + lt_psi_inv.dot(loadings);
let m_inv = scirs2_linalg::inv(&m.view(), None).map_err(|e| {
StatsError::ComputationError(format!("Failed to invert M matrix: {}", e))
})?;
let mut e_h = Array2::zeros((n_samples, self.n_factors));
let e_hht = m_inv.clone();
for i in 0..n_samples {
let x = data.row(i);
            let e_h_i = m_inv.dot(&lt_psi_inv.dot(&x.to_owned()));
e_h.row_mut(i).assign(&e_h_i);
}
Ok((e_h, e_hht))
}
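    /// M-step: Λ = (Xᵀ E[H]) (Σᵢ E[hᵢhᵢᵀ])⁻¹ with Σᵢ E[hᵢhᵢᵀ] = n·Cov(h|x) + E[H]ᵀE[H],
    /// and ψⱼ = (1/n) Σᵢ [(xᵢⱼ - lⱼᵀE[hᵢ])² + lⱼᵀ Cov(h|x) lⱼ], floored at 1e-6.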
fn m_step(
&self,
data: &Array2<f64>,
e_h: &Array2<f64>,
e_hht: &Array2<f64>,
) -> Result<(Array2<f64>, Array1<f64>)> {
let (n_samples, n_features) = data.dim();
        // Update the loadings: Lambda_new = (X^T E[H]) (sum_i E[h_i h_i^T])^{-1}, where
        // sum_i E[h_i h_i^T] = n * Cov(h | x) + E[H]^T E[H].
        let xte_h = data.t().dot(e_h);
        let sum_e_hht = e_hht * n_samples as f64 + e_h.t().dot(e_h);
        let sum_e_hht_inv = scirs2_linalg::inv(&sum_e_hht.view(), None).map_err(|e| {
            StatsError::ComputationError(format!("Failed to invert sum E[HH^T]: {}", e))
        })?;
        let new_loadings = xte_h.dot(&sum_e_hht_inv);
        // Update the specific variances:
        // psi_j = (1/n) sum_i [(x_ij - l_j^T E[h_i])^2 + l_j^T Cov(h | x) l_j], floored at 1e-6.
        let mut new_psi = Array1::zeros(n_features);
        for j in 0..n_features {
            let x_j = data.column(j);
            let l_j = new_loadings.row(j);
            // The posterior covariance term is identical for every sample, so compute it once.
            let quad_form = l_j.dot(&e_hht.dot(&l_j.to_owned()));
            let mut sum_var = 0.0;
            for i in 0..n_samples {
                let residual = x_j[i] - l_j.dot(&e_h.row(i).to_owned());
                sum_var += residual * residual + quad_form;
            }
            new_psi[j] = (sum_var / n_samples as f64).max(1e-6);
        }
Ok((new_loadings, new_psi))
}
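    /// Gaussian log-likelihood of the centered data under Σ = ΛΛᵀ + diag(Ψ):
    /// Σᵢ [ -(p/2)·ln(2π) - (1/2)·ln|Σ| - (1/2)·xᵢᵀΣ⁻¹xᵢ ].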
fn compute_log_likelihood(
&self,
data: &Array2<f64>,
loadings: &Array2<f64>,
psi: &Array1<f64>,
) -> Result<f64> {
let (n_samples, n_features) = data.dim();
let ll_t = loadings.dot(&loadings.t());
let mut sigma = ll_t;
for i in 0..n_features {
sigma[[i, i]] += psi[i];
}
let det_sigma = scirs2_linalg::det(&sigma.view(), None).map_err(|e| {
StatsError::ComputationError(format!("Failed to compute determinant: {}", e))
})?;
if det_sigma <= 0.0 {
return Err(StatsError::ComputationError(
"Covariance matrix must be positive definite".to_string(),
));
}
let sigma_inv = scirs2_linalg::inv(&sigma.view(), None).map_err(|e| {
StatsError::ComputationError(format!("Failed to invert covariance: {}", e))
})?;
let mut log_likelihood = 0.0;
let log_det_term =
-0.5 * n_features as f64 * (2.0 * std::f64::consts::PI).ln() - 0.5 * det_sigma.ln();
for i in 0..n_samples {
let x = data.row(i);
let quad_form = x.dot(&sigma_inv.dot(&x.to_owned()));
log_likelihood += log_det_term - 0.5 * quad_form;
}
Ok(log_likelihood)
}
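    /// Varimax rotation via pairwise planar rotations: for each pair of factors the
    /// angle maximizing Kaiser's varimax criterion is
    /// φ = (1/4)·atan((D - 2AB/p) / (C - (A² - B²)/p)), where for u = λᵢ² - λⱼ² and
    /// v = 2λᵢλⱼ, A = Σu, B = Σv, C = Σ(u² - v²), D = 2Σuv, and p is the number of
    /// variables; sweeps repeat until every angle falls below the tolerance.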
fn varimax_rotation(&self, loadings: &Array2<f64>) -> Result<Array2<f64>> {
let (n_features, n_factors) = loadings.dim();
let mut rotated = loadings.clone();
let max_iter = 30;
let tol = 1e-6;
for _ in 0..max_iter {
            let mut converged = true;
for i in 0..n_factors {
for j in (i + 1)..n_factors {
let col_i = rotated.column(i).to_owned();
let col_j = rotated.column(j).to_owned();
let u = &col_i * &col_i - &col_j * &col_j;
let v = 2.0 * &col_i * &col_j;
let a = u.sum();
let b = v.sum();
let c = (&u * &u - &v * &v).sum();
let d = 2.0 * (&u * &v).sum();
let num = d - 2.0 * a * b / n_features as f64;
let den = c - (a * a - b * b) / n_features as f64;
if den.abs() < 1e-10 {
continue;
}
let phi = 0.25 * (num / den).atan();
if phi.abs() > tol {
converged = false;
let cos_phi = phi.cos();
let sin_phi = phi.sin();
let new_col_i = cos_phi * &col_i - sin_phi * &col_j;
let new_col_j = sin_phi * &col_i + cos_phi * &col_j;
rotated.column_mut(i).assign(&new_col_i);
rotated.column_mut(j).assign(&new_col_j);
}
}
}
if converged {
break;
}
}
Ok(rotated)
}
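    /// Promax rotation: starts from the varimax solution, builds the target matrix
    /// |λ|^κ·sign(λ) with κ = 4, and applies the least-squares transform
    /// (LᵀL)⁻¹LᵀP to obtain an oblique solution.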
fn promax_rotation(&self, loadings: &Array2<f64>) -> Result<Array2<f64>> {
let varimax_rotated = self.varimax_rotation(loadings)?;
        let kappa = 4.0;
        let (n_features, n_factors) = varimax_rotated.dim();
let mut target = Array2::zeros((n_features, n_factors));
for i in 0..n_features {
for j in 0..n_factors {
let val = varimax_rotated[[i, j]];
target[[i, j]] = val.abs().powf(kappa) * val.signum();
}
}
let ltl = varimax_rotated.t().dot(&varimax_rotated);
        let ltl_inv = scirs2_linalg::inv(&ltl.view(), None)
.map_err(|e| StatsError::ComputationError(format!("Failed to invert L^T L: {}", e)))?;
let ltp = varimax_rotated.t().dot(&target);
        let transform = ltl_inv.dot(&ltp);
let rotated = varimax_rotated.dot(&transform);
Ok(rotated)
}
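    /// Factor scores via the weighted-least-squares (Bartlett) estimator:
    /// scores = X · [(Λᵀ Ψ⁻¹ Λ)⁻¹ Λᵀ Ψ⁻¹]ᵀ.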
fn compute_factor_scores(
&self,
data: &Array2<f64>,
loadings: &Array2<f64>,
psi: &Array1<f64>,
) -> Result<Array2<f64>> {
let n_features = loadings.nrows();
let mut psi_inv = Array2::zeros((n_features, n_features));
for i in 0..n_features {
psi_inv[[i, i]] = 1.0 / psi[i];
}
let lt_psi_inv = loadings.t().dot(&psi_inv);
let lt_psi_inv_l = lt_psi_inv.dot(loadings);
        let lt_psi_inv_l_inv = scirs2_linalg::inv(&lt_psi_inv_l.view(), None).map_err(|e| {
StatsError::ComputationError(format!("Failed to compute factor score weights: {}", e))
})?;
        let score_weights = lt_psi_inv_l_inv.dot(&lt_psi_inv);
let scores = data.dot(&score_weights.t());
Ok(scores)
}
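    /// Proportion of the common (shared) variance attributed to each factor:
    /// each factor's sum of squared loadings divided by the total across factors.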
fn compute_explained_variance(&self, loadings: &Array2<f64>) -> Array1<f64> {
let factor_variances = loadings
.axis_iter(Axis(1))
.map(|col| col.dot(&col))
.collect::<Vec<_>>();
let total_variance: f64 = factor_variances.iter().sum();
Array1::from_vec(factor_variances).mapv(|v| v / total_variance)
}
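    /// Communality of each variable: the sum of its squared loadings over the factors.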
fn compute_communalities(&self, loadings: &Array2<f64>) -> Array1<f64> {
let mut communalities = Array1::zeros(loadings.nrows());
for i in 0..loadings.nrows() {
communalities[i] = loadings.row(i).dot(&loadings.row(i));
}
communalities
}
pub fn transform(
&self,
data: ArrayView2<f64>,
result: &FactorAnalysisResult,
) -> Result<Array2<f64>> {
checkarray_finite(&data, "data")?;
if data.ncols() != result.mean.len() {
return Err(StatsError::DimensionMismatch(format!(
"data has {} features, expected {}",
data.ncols(),
result.mean.len()
)));
}
let mut centered = data.to_owned();
for mut row in centered.rows_mut() {
row -= &result.mean;
}
        self.compute_factor_scores(&centered, &result.loadings, &result.noise_variance)
}
}
pub mod efa {
use super::*;
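    /// Horn's parallel analysis: the number of factors to retain is the count of
    /// leading correlation-matrix eigenvalues of the real data that exceed the
    /// given percentile of eigenvalues obtained from `n_simulations` random
    /// standard-normal datasets of the same shape (at least 1 is returned).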
pub fn parallel_analysis(
data: ArrayView2<f64>,
n_simulations: usize,
percentile: f64,
seed: Option<u64>,
) -> Result<usize> {
checkarray_finite(&data, "data")?;
check_positive(n_simulations, "n_simulations")?;
if percentile <= 0.0 || percentile >= 100.0 {
return Err(StatsError::InvalidArgument(
"percentile must be between 0 and 100".to_string(),
));
}
let (n_samples, n_features) = data.dim();
let real_eigenvalues = compute_correlation_eigenvalues(data)?;
let mut rng = match seed {
Some(s) => StdRng::seed_from_u64(s),
None => {
use std::time::{SystemTime, UNIX_EPOCH};
let s = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
StdRng::seed_from_u64(s)
}
};
        use scirs2_core::random::{Distribution, Normal};
        let normal = Normal::new(0.0, 1.0).map_err(|e| {
            StatsError::ComputationError(format!("Failed to create normal distribution: {}", e))
        })?;
        let mut simulated_eigenvalues = Vec::with_capacity(n_simulations);
        for _ in 0..n_simulations {
            let mut randomdata = Array2::zeros((n_samples, n_features));
            for i in 0..n_samples {
                for j in 0..n_features {
                    randomdata[[i, j]] = normal.sample(&mut rng);
                }
            }
let eigenvalues = compute_correlation_eigenvalues(randomdata.view())?;
simulated_eigenvalues.push(eigenvalues);
}
let mut thresholds = Array1::zeros(n_features);
for i in 0..n_features {
let mut values: Vec<f64> = simulated_eigenvalues.iter().map(|ev| ev[i]).collect();
values.sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
let index = ((percentile / 100.0) * (n_simulations - 1) as f64).round() as usize;
thresholds[i] = values[index.min(n_simulations - 1)];
}
let mut n_factors = 0;
for i in 0..n_features {
if real_eigenvalues[i] > thresholds[i] {
n_factors += 1;
} else {
break;
}
}
        Ok(n_factors.max(1))
    }
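    /// Eigenvalues of the sample correlation matrix of `data`, in descending order.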
fn compute_correlation_eigenvalues(data: ArrayView2<f64>) -> Result<Array1<f64>> {
let mean = data.mean_axis(Axis(0)).expect("Operation failed");
let mut centered = data.to_owned();
for mut row in centered.rows_mut() {
row -= &mean;
}
        let cov = centered.t().dot(&centered) / (data.nrows() - 1) as f64;
let mut corr = cov.clone();
for i in 0..corr.nrows() {
for j in 0..corr.ncols() {
let std_i = cov[[i, i]].sqrt();
let std_j = cov[[j, j]].sqrt();
if std_i > 1e-10 && std_j > 1e-10 {
corr[[i, j]] = cov[[i, j]] / (std_i * std_j);
}
}
}
let (eigenvalues, _eigenvectors) =
scirs2_linalg::eigh_f64_lapack(&corr.view()).map_err(|e| {
StatsError::ComputationError(format!("Eigenvalue decomposition failed: {}", e))
})?;
let mut sorted_eigenvalues: Vec<f64> = eigenvalues.to_vec();
sorted_eigenvalues.sort_by(|a: &f64, b: &f64| b.partial_cmp(a).expect("Operation failed"));
Ok(Array1::from_vec(sorted_eigenvalues))
}
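    /// Kaiser-Meyer-Olkin measure of sampling adequacy:
    /// KMO = Σ r²ᵢⱼ / (Σ r²ᵢⱼ + Σ q²ᵢⱼ), where rᵢⱼ are the off-diagonal correlations
    /// and qᵢⱼ are partial correlations derived from the inverse correlation matrix;
    /// values near 1 indicate data well suited to factor analysis.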
pub fn kmo_test(data: ArrayView2<f64>) -> Result<f64> {
checkarray_finite(&data, "data")?;
let mean = data.mean_axis(Axis(0)).expect("Operation failed");
let mut centered = data.to_owned();
for mut row in centered.rows_mut() {
row -= &mean;
}
        let cov = centered.t().dot(&centered) / (data.nrows() - 1) as f64;
let n = cov.nrows();
let mut corr = Array2::zeros((n, n));
for i in 0..n {
for j in 0..n {
let std_i = cov[[i, i]].sqrt();
let std_j = cov[[j, j]].sqrt();
if std_i > 1e-10 && std_j > 1e-10 {
corr[[i, j]] = cov[[i, j]] / (std_i * std_j);
} else if i == j {
corr[[i, j]] = 1.0;
}
}
}
let corr_inv = scirs2_linalg::inv(&corr.view(), None).map_err(|e| {
StatsError::ComputationError(format!("Failed to invert correlation matrix: {}", e))
})?;
let mut sum_squared_corr = 0.0;
let mut sum_squared_partial = 0.0;
for i in 0..n {
for j in 0..n {
if i != j {
sum_squared_corr += corr[[i, j]] * corr[[i, j]];
let partial = -corr_inv[[i, j]] / (corr_inv[[i, i]] * corr_inv[[j, j]]).sqrt();
sum_squared_partial += partial * partial;
}
}
}
let kmo = sum_squared_corr / (sum_squared_corr + sum_squared_partial);
Ok(kmo)
}
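    /// Bartlett's test of sphericity: χ² = -(n - 1 - (2p + 5)/6)·ln|R| with
    /// p(p - 1)/2 degrees of freedom; returns the statistic and an approximate
    /// p-value (see `chi2_survival` for the approximation used).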
pub fn bartlett_test(data: ArrayView2<f64>) -> Result<(f64, f64)> {
checkarray_finite(&data, "data")?;
let (n, p) = data.dim();
if n <= p {
return Err(StatsError::InvalidArgument(
"Number of samples must exceed number of variables".to_string(),
));
}
let mean = data.mean_axis(Axis(0)).expect("Operation failed");
let mut centered = data.to_owned();
for mut row in centered.rows_mut() {
row -= &mean;
}
        let cov = centered.t().dot(&centered) / (n - 1) as f64;
let mut corr = Array2::zeros((p, p));
for i in 0..p {
for j in 0..p {
let std_i = cov[[i, i]].sqrt();
let std_j = cov[[j, j]].sqrt();
if std_i > 1e-10 && std_j > 1e-10 {
corr[[i, j]] = cov[[i, j]] / (std_i * std_j);
} else if i == j {
corr[[i, j]] = 1.0;
}
}
}
let det_corr = scirs2_linalg::det(&corr.view(), None).map_err(|e| {
StatsError::ComputationError(format!("Failed to compute determinant: {}", e))
})?;
if det_corr <= 0.0 {
return Err(StatsError::ComputationError(
"Correlation matrix must be positive definite".to_string(),
));
}
let chi2 = -(n as f64 - 1.0 - (2.0 * p as f64 + 5.0) / 6.0) * det_corr.ln();
let df = p * (p - 1) / 2;
let p_value = chi2_survival(chi2, df as f64);
Ok((chi2, p_value))
}
}
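/// Rough survival function for the chi-square distribution: a normal approximation
/// for df > 30 and exp(-x/df) otherwise (exact only for df = 2). Adequate for a
/// coarse p-value, not for precise inference.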
#[allow(dead_code)]
fn chi2_survival(x: f64, df: f64) -> f64 {
if x <= 0.0 {
return 1.0;
}
let mean = df;
let var = 2.0 * df;
let std = var.sqrt();
if df > 30.0 {
let z = (x - mean) / std;
return 0.5 * (1.0 - erf(z / std::f64::consts::SQRT_2));
}
(-x / mean).exp()
}
#[allow(dead_code)]
fn erf(x: f64) -> f64 {
let a1 = 0.254829592;
let a2 = -0.284496736;
let a3 = 1.421413741;
let a4 = -1.453152027;
let a5 = 1.061405429;
let p = 0.3275911;
let sign = if x >= 0.0 { 1.0 } else { -1.0 };
let x = x.abs();
let t = 1.0 / (1.0 + p * x);
let y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * (-x * x).exp();
sign * y
}