#![allow(clippy::disallowed_methods)]
use aprender::preprocessing::{StandardScaler, PCA};
use aprender::primitives::Matrix;
use aprender::traits::Transformer;
fn create_iris_data() -> (Matrix<f32>, Vec<&'static str>) {
let data = Matrix::from_vec(
30,
4,
vec![
5.1, 3.5, 1.4, 0.2, 4.9, 3.0, 1.4, 0.2, 4.7, 3.2, 1.3, 0.2, 4.6, 3.1, 1.5, 0.2, 5.0,
3.6, 1.4, 0.2, 5.4, 3.9, 1.7, 0.4, 4.6, 3.4, 1.4, 0.3, 5.0, 3.4, 1.5, 0.2, 4.4, 2.9,
1.4, 0.2, 4.9, 3.1, 1.5, 0.1, 7.0, 3.2, 4.7, 1.4, 6.4, 3.2, 4.5, 1.5, 6.9, 3.1, 4.9, 1.5, 5.5, 2.3, 4.0, 1.3, 6.5,
2.8, 4.6, 1.5, 5.7, 2.8, 4.5, 1.3, 6.3, 3.3, 4.7, 1.6, 4.9, 2.4, 3.3, 1.0, 6.6, 2.9,
4.6, 1.3, 5.2, 2.7, 3.9, 1.4, 6.3, 3.3, 6.0, 2.5, 5.8, 2.7, 5.1, 1.9, 7.1, 3.0, 5.9, 2.1, 6.3, 2.9, 5.6, 1.8, 6.5,
3.0, 5.8, 2.2, 7.6, 3.0, 6.6, 2.1, 4.9, 2.5, 4.5, 1.7, 7.3, 2.9, 6.3, 1.8, 6.7, 2.5,
5.8, 1.8, 7.2, 3.6, 6.1, 2.5,
],
)
.expect("Example data should be valid");
let species = vec![
"Setosa",
"Setosa",
"Setosa",
"Setosa",
"Setosa",
"Setosa",
"Setosa",
"Setosa",
"Setosa",
"Setosa",
"Versicolor",
"Versicolor",
"Versicolor",
"Versicolor",
"Versicolor",
"Versicolor",
"Versicolor",
"Versicolor",
"Versicolor",
"Versicolor",
"Virginica",
"Virginica",
"Virginica",
"Virginica",
"Virginica",
"Virginica",
"Virginica",
"Virginica",
"Virginica",
"Virginica",
];
(data, species)
}
fn standardize_and_reduce(data: &Matrix<f32>) -> (StandardScaler, PCA, Matrix<f32>, Matrix<f32>) {
let (_, n_features) = data.shape();
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā");
println!(" Step 1: Standardizing Features");
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n");
let mut scaler = StandardScaler::new();
let scaled_data = scaler
.fit_transform(data)
.expect("Example data should be valid");
println!("Why standardize?");
println!(" PCA is sensitive to feature scales. Standardizing ensures");
println!(" all features contribute equally to principal components.\n");
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā");
println!(" Step 2: Applying PCA (4D ā 2D)");
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n");
let mut pca = PCA::new(2);
let transformed = pca
.fit_transform(&scaled_data)
.expect("Example data should be valid");
println!("Dimensionality Reduction:");
println!(" Original: {n_features} features");
println!(" Reduced: {} principal components\n", 2);
(scaler, pca, scaled_data, transformed)
}
fn analyze_explained_variance(pca: &PCA) -> f32 {
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā");
println!(" Step 3: Explained Variance Analysis");
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n");
let explained_var = pca
.explained_variance()
.expect("Example data should be valid");
let explained_ratio = pca
.explained_variance_ratio()
.expect("Example data should be valid");
println!("Explained Variance by Component:");
for i in 0..2 {
let variance_pct = explained_ratio[i] * 100.0;
let bar_length = (variance_pct / 2.0) as usize;
let bar = "ā".repeat(bar_length);
println!(
" PC{}: {:.4} ({:5.2}%) {}",
i + 1,
explained_var[i],
variance_pct,
bar
);
}
let total_variance: f32 = explained_ratio.iter().sum();
println!("\nTotal Variance Captured: {:.2}%", total_variance * 100.0);
println!(
"Information Lost: {:.2}%\n",
(1.0 - total_variance) * 100.0
);
println!("š” Interpretation:");
println!(
" First two components capture {:.1}% of variance",
total_variance * 100.0
);
println!(" This is excellent for 2D visualization!\n");
total_variance
}
fn display_transformed_data(transformed: &Matrix<f32>, species: &[&str]) {
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā");
println!(" Step 4: Transformed Data (First 5 samples per species)");
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n");
println!(
"{:>6} {:>12} {:>10} {:>10}",
"Sample", "Species", "PC1", "PC2"
);
println!("{}", "ā".repeat(44));
for i in &[0, 1, 2, 3, 4, 10, 11, 12, 13, 14, 20, 21, 22, 23, 24] {
println!(
"{:>6} {:>12} {:>10.4} {:>10.4}",
i,
species[*i],
transformed.get(*i, 0),
transformed.get(*i, 1)
);
}
println!();
}
fn reconstruction_analysis(
data: &Matrix<f32>,
pca: &PCA,
scaler: &StandardScaler,
transformed: &Matrix<f32>,
) -> f32 {
let (n_samples, n_features) = data.shape();
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā");
println!(" Step 5: Reconstruction from 2D ā 4D");
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n");
let reconstructed_scaled = pca
.inverse_transform(transformed)
.expect("Example data should be valid");
let reconstructed = scaler
.inverse_transform(&reconstructed_scaled)
.expect("Example data should be valid");
let mut total_squared_error = 0.0;
let mut max_error = 0.0;
for i in 0..n_samples {
for j in 0..n_features {
let error = (data.get(i, j) - reconstructed.get(i, j)).abs();
total_squared_error += error * error;
if error > max_error {
max_error = error;
}
}
}
let mse = total_squared_error / (n_samples * n_features) as f32;
let rmse = mse.sqrt();
println!("Reconstruction Error Metrics:");
println!(" MSE: {mse:.6}");
println!(" RMSE: {rmse:.6}");
println!(" Max Error: {max_error:.6}\n");
println!("Sample Reconstruction (First 3 samples):");
println!(
"{:>8} {:>10} {:>14}",
"Feature", "Original", "Reconstructed"
);
println!("{}", "ā".repeat(34));
let feature_names = ["Sepal L", "Sepal W", "Petal L", "Petal W"];
for sample_idx in 0..3 {
println!("\nSample {sample_idx}:");
for (j, fname) in feature_names.iter().enumerate() {
println!(
"{:>8} {:>10.4} {:>14.4}",
fname,
data.get(sample_idx, j),
reconstructed.get(sample_idx, j)
);
}
}
println!();
println!("š” Interpretation:");
println!(" RMSE of {rmse:.4} means typical reconstruction error is small");
println!(" Dimensionality reduction preserved most information!\n");
rmse
}
fn show_component_loadings(pca: &PCA) {
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā");
println!(" Step 6: Principal Components (Feature Loadings)");
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n");
let components = pca.components().expect("Example data should be valid");
println!(
"{:>10} {:>10} {:>10} {:>10} {:>10}",
"Component", "Sepal L", "Sepal W", "Petal L", "Petal W"
);
println!("{}", "ā".repeat(54));
for i in 0..2 {
println!(
"{:>10} {:>10.4} {:>10.4} {:>10.4} {:>10.4}",
format!("PC{}", i + 1),
components.get(i, 0),
components.get(i, 1),
components.get(i, 2),
components.get(i, 3)
);
}
println!();
println!("š” Feature Importance:");
println!(" Larger absolute values = more important for that component");
println!(" PC1 likely captures overall flower size");
println!(" PC2 likely captures petal vs sepal differences\n");
}
fn print_summary(total_variance: f32, rmse: f32) {
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā");
println!(" Summary");
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n");
println!("Key Findings:");
println!(
" ⢠Reduced 4D iris data to 2D with {:.1}% variance retained",
total_variance * 100.0
);
println!(" ⢠RMSE of {rmse:.4} shows good reconstruction quality");
println!(" ⢠2D representation suitable for visualization");
println!(" ⢠Three species likely separable in 2D PC space\n");
println!("When to use PCA:");
println!(" ā Visualizing high-dimensional data");
println!(" ā Reducing features before ML training");
println!(" ā Removing correlated features");
println!(" ā Denoising data");
println!(" ā Compression\n");
println!("ā ļø PCA Assumptions:");
println!(" ⢠Linear relationships between features");
println!(" ⢠High variance = high importance");
println!(" ⢠Features should be standardized first\n");
println!("š Performance Notes:");
println!(" ⢠nalgebra's SymmetricEigen: O(n³) for nĆn covariance matrix");
println!(" ⢠Transform: O(n_samples à n_components à n_features)");
println!(" ⢠Memory: O(n_components à n_features) for components\n");
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā");
println!(" Example Complete!");
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā");
}
fn main() {
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā");
println!(" PCA Dimensionality Reduction - Iris Dataset");
println!("āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n");
let (data, species) = create_iris_data();
let (n_samples, n_features) = data.shape();
println!("š Dataset Information:");
println!(" Samples: {n_samples}");
println!(" Features: {n_features} (sepal length, sepal width, petal length, petal width)");
println!(" Species: Setosa, Versicolor, Virginica (10 each)\n");
let (scaler, pca, _scaled_data, transformed) = standardize_and_reduce(&data);
let total_variance = analyze_explained_variance(&pca);
display_transformed_data(&transformed, &species);
let rmse = reconstruction_analysis(&data, &pca, &scaler, &transformed);
show_component_loadings(&pca);
print_summary(total_variance, rmse);
}