//! Optimized Dimension Reduction Example
//!
//! This example demonstrates how to use PandRS's dimension reduction algorithms (PCA, t-SNE)
//! with the optimized DataFrame implementation and GPU acceleration. It shows how to create
//! a synthetic dataset, perform dimension reduction, and analyze the results.
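//!
//! The synthetic dataset contains 300 samples in 10 dimensions, split evenly
//! across three labeled clusters ("A", "B", "C") with different centers and spreads.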
//!
//! To run this example:
//! cargo run --example optimized_ml_dimension_reduction_example --features "optimized cuda"
//!
#[cfg(all(feature = "optimized", feature = "cuda"))]
use pandrs::column::{Column, ColumnTrait, Float64Column, StringColumn};
#[cfg(all(feature = "optimized", feature = "cuda"))]
use pandrs::ml::dimension::{TSNEInit, PCA, TSNE};
#[cfg(all(feature = "optimized", feature = "cuda"))]
use pandrs::ml::{Transformer, UnsupervisedModel};
#[cfg(all(feature = "optimized", feature = "cuda"))]
use pandrs::{error::Error, OptimizedDataFrame};
#[cfg(all(feature = "optimized", feature = "cuda"))]
use rand::Rng;
#[cfg(all(feature = "optimized", feature = "cuda"))]
#[allow(clippy::result_large_err)]
fn main() -> Result<(), Error> {
println!("Optimized ML dimension reduction example is currently disabled due to ongoing API changes.");
println!("This example will be re-enabled in a future release.");
Ok(())
}
#[cfg(all(feature = "optimized", feature = "cuda"))]
#[allow(dead_code)]
#[allow(clippy::result_large_err)]
fn _disabled_main() -> Result<(), Error> {
println!("Example of PandRS Dimension Reduction Algorithms");
println!("============================");
// Generate sample data
let df = create_sample_data()?;
println!(
"Original DataFrame: {:?} rows x {:?} columns",
df.row_count(),
df.column_names().len()
);
// Example of PCA
pca_example(&df)?;
// Example of t-SNE
tsne_example(&df)?;
Ok(())
}
#[cfg(all(feature = "optimized", feature = "cuda"))]
// Example of PCA
#[allow(clippy::result_large_err)]
#[allow(dead_code)]
fn pca_example(df: &OptimizedDataFrame) -> Result<(), Error> {
println!("\n==== PCA (Principal Component Analysis) ====");
// Create PCA instance (reduce to 2 dimensions)
let mut pca = PCA::new(2, true);
// Execute PCA
let pca_result = pca.fit_transform(df)?;
println!(
"\nDataFrame after PCA: {:?} rows x {:?} columns",
pca_result.row_count(),
pca_result.column_names().len()
);
    // Display summary statistics for Principal Component 1 and Principal
    // Component 2, using the shared column_mean_std helper defined below
    if let Some((mean, std_dev)) = column_mean_std(&pca_result, "PC1") {
        println!("\nMean of Principal Component 1: {:.4}", mean);
        println!("Standard Deviation of Principal Component 1: {:.4}", std_dev);
    }
    if let Some((mean, std_dev)) = column_mean_std(&pca_result, "PC2") {
        println!("Mean of Principal Component 2: {:.4}", mean);
        println!("Standard Deviation of Principal Component 2: {:.4}", std_dev);
    }
    // Display explained variance ratio
    println!("\nExplained Variance Ratio:");
    let explained_variance = pca.explained_variance;
    let total_variance = explained_variance.iter().sum::<f64>();

    // Calculate ratios ourselves
    let mut cumulative = 0.0;
    let mut cumulative_ratios = Vec::new();
    for (i, &variance) in explained_variance.iter().enumerate() {
        let ratio = variance / total_variance;
        cumulative += ratio;
        cumulative_ratios.push(cumulative);
        println!("PC{}: {:.4} ({:.2}%)", i + 1, ratio, ratio * 100.0);
    }

    println!("\nCumulative Explained Variance:");
    for (i, ratio) in cumulative_ratios.iter().enumerate() {
        println!("PC1-PC{}: {:.4} ({:.2}%)", i + 1, ratio, ratio * 100.0);
    }
    // Calculate mean of principal component values for each cluster
    calculate_cluster_means(&pca_result, "PC1", "PC2")?;

    // PCA to 3 dimensions
    println!("\n==== 3D PCA ====");
    let mut pca3d = PCA::new(3, true);
    let pca3d_result = pca3d.fit_transform(df)?;
    println!(
        "\nDataFrame after 3D PCA: {:?} rows x {:?} columns",
        pca3d_result.row_count(),
        pca3d_result.column_names().len()
    );
    // Display explained variance ratio
    println!("\nExplained Variance Ratio of 3D PCA:");
    let explained_variance = pca3d.explained_variance;
    let total_variance = explained_variance.iter().sum::<f64>();

    // Calculate ratios ourselves
    let mut cumulative = 0.0;
    let mut cumulative_ratios = Vec::new();
    for (i, &variance) in explained_variance.iter().enumerate() {
        let ratio = variance / total_variance;
        cumulative += ratio;
        cumulative_ratios.push(cumulative);
        println!("PC{}: {:.4} ({:.2}%)", i + 1, ratio, ratio * 100.0);
    }

    // Display cumulative variance
    let last_cumulative = cumulative_ratios.last().unwrap_or(&0.0);
    println!(
        "Cumulative Explained Variance: {:.4} ({:.2}%)",
        last_cumulative,
        last_cumulative * 100.0
    );

    Ok(())
}
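
#[cfg(all(feature = "optimized", feature = "cuda"))]
// Shared helper for the PC1/PC2 summaries above: computes the mean and sample
// standard deviation of a float64 column, returning None when the column is
// missing, not float64, or empty. This is a small extraction of logic that
// previously appeared twice in pca_example; it relies only on the column-view
// APIs already used elsewhere in this example.
#[allow(dead_code)]
fn column_mean_std(df: &OptimizedDataFrame, name: &str) -> Option<(f64, f64)> {
    let view = df.column(name).ok()?;
    let col = view.as_float64()?;

    // Collect the non-null values
    let values: Vec<f64> = (0..col.len())
        .filter_map(|i| col.get(i).ok().flatten())
        .collect();
    if values.is_empty() {
        return None;
    }

    let mean = values.iter().sum::<f64>() / values.len() as f64;
    // Sample standard deviation (n - 1 denominator); 0.0 for a single value
    let std_dev = if values.len() > 1 {
        let var = values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>()
            / (values.len() - 1) as f64;
        var.sqrt()
    } else {
        0.0
    };
    Some((mean, std_dev))
}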
#[cfg(all(feature = "optimized", feature = "cuda"))]
// Example of t-SNE
#[allow(clippy::result_large_err)]
#[allow(dead_code)]
fn tsne_example(df: &OptimizedDataFrame) -> Result<(), Error> {
println!("\n==== t-SNE (t-distributed Stochastic Neighbor Embedding) ====");
// Create t-SNE instance (reduce to 2 dimensions)
let mut tsne = TSNE::new();
// Configure the TSNE instance
tsne.n_components = 2;
tsne.perplexity = 30.0;
tsne.learning_rate = 200.0;
tsne.max_iter = 100;
tsne.init = TSNEInit::PCA;
println!("\nRunning t-SNE (this may take some time)...");
// Execute t-SNE
let tsne_result = tsne.fit_transform(df)?;
println!(
"DataFrame after t-SNE: {:?} rows x {:?} columns",
tsne_result.row_count(),
tsne_result.column_names().len()
);
// Calculate mean of embedding coordinates for each cluster
calculate_cluster_means(&tsne_result, "TSNE1", "TSNE2")?;
Ok(())
}
#[cfg(all(feature = "optimized", feature = "cuda"))]
// Calculate and display mean coordinates for each cluster
#[allow(clippy::result_large_err)]
#[allow(dead_code)]
fn calculate_cluster_means(df: &OptimizedDataFrame, x_col: &str, y_col: &str) -> Result<(), Error> {
    // Do nothing if there is no cluster column
    if !df.column_names().contains(&"cluster".to_string()) {
        return Ok(());
    }

    println!("\nMean Coordinates for Each Cluster:");

    let cluster_view = df.column("cluster")?;
    let x_view = df.column(x_col)?;
    let y_view = df.column(y_col)?;

    if let Some(cluster_col) = cluster_view.as_string() {
        // Renamed from x_col/y_col to avoid shadowing the &str parameters
        if let (Some(x_vals), Some(y_vals)) = (x_view.as_float64(), y_view.as_float64()) {
            // Collect cluster labels in order of first appearance
            let mut clusters = Vec::new();
            for i in 0..df.row_count() {
                if let Ok(Some(cluster)) = cluster_col.get(i) {
                    let cluster_str = cluster.to_string();
                    if !clusters.contains(&cluster_str) {
                        clusters.push(cluster_str);
                    }
                }
            }

            // Accumulate coordinate sums and counts for each cluster
            let mut cluster_sums = std::collections::HashMap::new();
            for i in 0..df.row_count() {
                if let (Ok(Some(cluster)), Ok(Some(x)), Ok(Some(y))) =
                    (cluster_col.get(i), x_vals.get(i), y_vals.get(i))
                {
                    let entry = cluster_sums
                        .entry(cluster.to_string())
                        .or_insert((0.0, 0.0, 0));
                    entry.0 += x;
                    entry.1 += y;
                    entry.2 += 1;
                }
            }

            // Calculate and display the mean for each cluster
            for cluster in clusters {
                if let Some(&(x_sum, y_sum, count)) = cluster_sums.get(&cluster) {
                    let x_mean = x_sum / count as f64;
                    let y_mean = y_sum / count as f64;
                    println!(
                        "Cluster {}: ({:.4}, {:.4}), {} samples",
                        cluster, x_mean, y_mean, count
                    );
                }
            }
        }
    }

    Ok(())
}
#[cfg(all(feature = "optimized", feature = "cuda"))]
// Generate sample data (high-dimensional data with 3 clusters)
#[allow(clippy::result_large_err)]
#[allow(dead_code)]
fn create_sample_data() -> Result<OptimizedDataFrame, Error> {
    let mut rng = rand::rng(); // rand 0.9 API (thread_rng() is deprecated)
    let mut df = OptimizedDataFrame::new();

    // Generate data for 300 samples
    let n_samples = 300;
    let n_features = 10; // 10-dimensional data

    // Define clusters and their centers in feature space
    let clusters = ["A", "B", "C"];
    let centers = [
        vec![0.0; n_features],  // Center of Cluster A
        vec![5.0; n_features],  // Center of Cluster B
        vec![-5.0; n_features], // Center of Cluster C
    ];

    // Generate values for each feature
    for j in 0..n_features {
        let mut feature_values = Vec::with_capacity(n_samples);

        // Data for Cluster A (center [0, 0, ...], small variance)
        for _ in 0..n_samples / 3 {
            feature_values.push(centers[0][j] + rng.random_range(-1.0..1.0));
        }
        // Data for Cluster B (center [5, 5, ...], large variance)
        for _ in 0..n_samples / 3 {
            feature_values.push(centers[1][j] + rng.random_range(-2.0..2.0));
        }
        // Data for Cluster C (center [-5, -5, ...], medium variance)
        for _ in 0..n_samples / 3 {
            feature_values.push(centers[2][j] + rng.random_range(-1.5..1.5));
        }

        // Add feature column to DataFrame
        let column = Float64Column::with_name(feature_values, format!("feature{}", j + 1));
        df.add_column(format!("feature{}", j + 1), Column::Float64(column))?;
    }

    // Add cluster labels
    let mut cluster_labels = Vec::with_capacity(n_samples);
    for cluster in &clusters {
        for _ in 0..n_samples / 3 {
            cluster_labels.push(cluster.to_string());
        }
    }
    let cluster_column = StringColumn::with_name(cluster_labels, "cluster");
    df.add_column("cluster", Column::String(cluster_column))?;

    Ok(df)
}
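
// A minimal sanity check for the data generator, assuming both feature flags
// are enabled. Note that Cargo does not run unit tests inside examples unless
// the example target opts in with `test = true`, so this mainly documents the
// expected shape: 300 rows, 10 feature columns plus the "cluster" label column.
#[cfg(all(test, feature = "optimized", feature = "cuda"))]
mod tests {
    use super::*;

    #[test]
    fn sample_data_has_expected_shape() {
        let df = create_sample_data().expect("sample data should build");
        assert_eq!(df.row_count(), 300);
        assert_eq!(df.column_names().len(), 11); // feature1..feature10 + cluster
    }
}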
#[cfg(not(all(feature = "optimized", feature = "cuda")))]
fn main() {
println!("This example requires both \"optimized\" and \"cuda\" feature flags to be enabled.");
println!("Please recompile with:");
println!(" cargo run --example optimized_ml_dimension_reduction_example --features \"optimized cuda\"");
}