use crate::lm::{LmConfig, lmdif};
use nalgebra::{DMatrix, DVector, SymmetricEigen};
#[derive(Debug, Clone)]
pub struct PcaModel {
pub ydat: Vec<Vec<f64>>,
pub mean: Vec<f64>,
pub components: Vec<Vec<f64>>,
pub eigenvalues: Vec<f64>,
pub variances: Vec<f64>,
pub ind: Vec<f64>,
pub nsig: usize,
}
#[derive(Debug, Clone)]
pub struct PcaFit {
pub weights: Vec<f64>,
pub yfit: Vec<f64>,
pub ydat: Vec<f64>,
pub chi_square: f64,
pub data_scale: f64,
}
fn from_internal(val: f64) -> f64 {
-1.0 + (val * val + 1.0).sqrt()
}
fn to_internal(scale: f64) -> f64 {
((scale + 1.0).powi(2) - 1.0).sqrt()
}
fn lstsq(a: &DMatrix<f64>, b: &DVector<f64>) -> (DVector<f64>, f64) {
let svd = a.clone().svd(true, true);
let x = svd.solve(b, 1.0e-14).expect("lstsq SVD solve failed");
let resid = a * &x - b;
(x, resid.norm_squared())
}
pub fn pca_train(ydat: &[Vec<f64>]) -> PcaModel {
let narr = ydat.len();
assert!(narr >= 2, "need at least 2 training spectra");
let nfreq = ydat[0].len();
for (i, row) in ydat.iter().enumerate() {
assert_eq!(row.len(), nfreq, "spectrum {i} length mismatch");
}
let nf = nfreq as f64;
let na = narr as f64;
let mut mean = vec![0.0; nfreq];
for row in ydat {
for (j, &v) in row.iter().enumerate() {
mean[j] += v;
}
}
for m in &mut mean {
*m /= na;
}
let a: Vec<Vec<f64>> = ydat
.iter()
.map(|row| row.iter().zip(&mean).map(|(&v, &m)| v - m).collect())
.collect();
let mut z = DMatrix::<f64>::zeros(nfreq, narr);
for (i, row) in a.iter().enumerate() {
let rmean = row.iter().sum::<f64>() / nf;
let var = row.iter().map(|&v| (v - rmean).powi(2)).sum::<f64>() / nf;
let rstd = var.sqrt();
for (j, &v) in row.iter().enumerate() {
z[(j, i)] = (v - rmean) / rstd;
}
}
let cov = (z.transpose() * &z) / na;
let (eval_asc, evec_asc) = eigh_ascending(&cov);
let m = (&z * (-&evec_asc)) / na; let mut eigenvalues = vec![0.0; narr];
let mut components = vec![vec![0.0; nfreq]; narr];
for k in 0..narr {
let src = narr - 1 - k;
eigenvalues[k] = eval_asc[src];
for j in 0..nfreq {
components[k][j] = m[(j, src)];
}
}
let esum: f64 = eigenvalues.iter().sum();
let variances: Vec<f64> = eigenvalues.iter().map(|&e| e / esum).collect();
let mut ind: Vec<f64> = Vec::new();
for r in 0..narr - 1 {
let nr = (narr - r - 1) as f64;
let suffix: f64 = eigenvalues[r..].iter().sum();
let indval = (nf * suffix / nr).sqrt() / nr.powi(2);
if ind.is_empty() {
ind.push(indval);
}
ind.push(indval);
}
let mut nsig = 0;
let mut best = ind[0];
for (i, &v) in ind.iter().enumerate().skip(1) {
if v < best {
best = v;
nsig = i;
}
}
PcaModel {
ydat: ydat.to_vec(),
mean,
components,
eigenvalues,
variances,
ind,
nsig,
}
}
fn eigh_ascending(cov: &DMatrix<f64>) -> (Vec<f64>, DMatrix<f64>) {
let n = cov.nrows();
let se = SymmetricEigen::new(cov.clone());
let mut idx: Vec<usize> = (0..n).collect();
idx.sort_by(|&i, &j| se.eigenvalues[i].partial_cmp(&se.eigenvalues[j]).unwrap());
let evals: Vec<f64> = idx.iter().map(|&i| se.eigenvalues[i]).collect();
let mut evecs = DMatrix::<f64>::zeros(n, n);
for (col, &src) in idx.iter().enumerate() {
evecs.set_column(col, &se.eigenvectors.column(src));
}
(evals, evecs)
}
pub fn pca_fit(unknown: &[f64], model: &PcaModel, ncomps: usize, rescale: bool) -> PcaFit {
let nfreq = model.mean.len();
assert_eq!(unknown.len(), nfreq, "unknown length mismatch");
let ncomps = ncomps.min(model.components.len());
let comps = DMatrix::from_fn(nfreq, ncomps, |r, c| model.components[c][r]);
let mean = DVector::from_row_slice(&model.mean);
let unk = DVector::from_row_slice(unknown);
let scale = if rescale {
let resid = |p: &[f64]| -> Vec<f64> {
let sc = from_internal(p[0]);
let b = &unk * sc - &mean;
let (w, _) = lstsq(&comps, &b);
let yfit = &comps * &w + &mean;
(&unk * sc - yfit).iter().copied().collect()
};
let cfg = LmConfig {
ftol: 1.0e-5,
xtol: 1.0e-5,
gtol: 1.0e-5,
maxfev: 2000 * 2,
epsfcn: 1.0e-5,
factor: 100.0,
};
let seed = [to_internal(1.0)];
let result = lmdif(resid, &seed, &cfg);
from_internal(result.x[0])
} else {
1.0
};
let ydat_scaled = &unk * scale;
let b = &ydat_scaled - &mean;
let (w, chi_square) = lstsq(&comps, &b);
let yfit = &comps * &w + &mean;
PcaFit {
weights: w.iter().copied().collect(),
yfit: yfit.iter().copied().collect(),
ydat: ydat_scaled.iter().copied().collect(),
chi_square,
data_scale: scale,
}
}