#[cfg(all(feature = "optimized", feature = "cuda"))]
use pandrs::column::{Column, ColumnTrait, Float64Column, StringColumn};
#[cfg(all(feature = "optimized", feature = "cuda"))]
use pandrs::ml::dimension::{TSNEInit, PCA, TSNE};
#[cfg(all(feature = "optimized", feature = "cuda"))]
use pandrs::ml::{Transformer, UnsupervisedModel};
#[cfg(all(feature = "optimized", feature = "cuda"))]
use pandrs::{error::Error, OptimizedDataFrame};
#[cfg(all(feature = "optimized", feature = "cuda"))]
use rand::Rng;
#[cfg(all(feature = "optimized", feature = "cuda"))]
#[allow(clippy::result_large_err)]
fn main() -> Result<(), Error> {
    // Entry point for the feature-enabled build: build a synthetic clustered
    // dataset, then run the PCA and t-SNE demonstrations against it.
    println!("Example of PandRS Dimension Reduction Algorithms");
    println!("============================");

    let data = create_sample_data()?;
    let (rows, cols) = (data.row_count(), data.column_names().len());
    println!("Original DataFrame: {:?} rows x {:?} columns", rows, cols);

    pca_example(&data)?;
    tsne_example(&data)?;
    Ok(())
}
#[cfg(all(feature = "optimized", feature = "cuda"))]
#[allow(clippy::result_large_err)]
/// Demonstrates 2D and 3D PCA on `df`, printing per-component statistics,
/// explained-variance ratios, and per-cluster mean coordinates.
///
/// # Errors
/// Propagates any `Error` from `PCA::fit_transform` or column access.
fn pca_example(df: &OptimizedDataFrame) -> Result<(), Error> {
    println!("\n==== PCA (Principal Component Analysis) ====");

    // 2 components, with standardization enabled.
    let mut pca = PCA::new(2, true);
    let pca_result = pca.fit_transform(df)?;
    println!(
        "\nDataFrame after PCA: {:?} rows x {:?} columns",
        pca_result.row_count(),
        pca_result.column_names().len()
    );

    // Summary statistics for the first two principal components.
    if let Some((mean, std_dev)) = column_mean_std(&pca_result, "PC1") {
        println!("\nMean of Principal Component 1: {:.4}", mean);
        println!(
            "Standard Deviation of Principal Component 1: {:.4}",
            std_dev
        );
    }
    if let Some((mean, std_dev)) = column_mean_std(&pca_result, "PC2") {
        println!("Mean of Principal Component 2: {:.4}", mean);
        println!(
            "Standard Deviation of Principal Component 2: {:.4}",
            std_dev
        );
    }

    println!("\nExplained Variance Ratio:");
    let cumulative_ratios = print_variance_ratios(&pca.explained_variance);
    println!("\nCumulative Explained Variance:");
    for (i, ratio) in cumulative_ratios.iter().enumerate() {
        println!("PC1-PC{}: {:.4} ({:.2}%)", i + 1, ratio, ratio * 100.0);
    }

    calculate_cluster_means(&pca_result, "PC1", "PC2")?;

    println!("\n==== 3D PCA ====");
    let mut pca3d = PCA::new(3, true);
    let pca3d_result = pca3d.fit_transform(df)?;
    println!(
        "\nDataFrame after 3D PCA: {:?} rows x {:?} columns",
        pca3d_result.row_count(),
        pca3d_result.column_names().len()
    );

    println!("\nExplained Variance Ratio of 3D PCA:");
    let cumulative_ratios = print_variance_ratios(&pca3d.explained_variance);
    // For 3D PCA only the total cumulative ratio is reported.
    let last_cumulative = cumulative_ratios.last().unwrap_or(&0.0);
    println!(
        "Cumulative Explained Variance: {:.4} ({:.2}%)",
        last_cumulative,
        last_cumulative * 100.0
    );
    Ok(())
}

/// Computes the mean and sample standard deviation of the float64 column
/// `name` in `df`, skipping missing values.
///
/// Returns `None` when the column is absent or not a float64 column.
/// An empty column yields a mean of 0.0; fewer than two values yield a
/// standard deviation of 0.0 (matching the original inline logic).
#[cfg(all(feature = "optimized", feature = "cuda"))]
fn column_mean_std(df: &OptimizedDataFrame, name: &str) -> Option<(f64, f64)> {
    let view = df.column(name).ok()?;
    let col = view.as_float64()?;
    let values: Vec<f64> = (0..col.len())
        .filter_map(|i| col.get(i).ok().flatten())
        .collect();
    let mean = if !values.is_empty() {
        values.iter().sum::<f64>() / values.len() as f64
    } else {
        0.0
    };
    let std_dev = if values.len() > 1 {
        // Sample variance (n - 1 denominator).
        let var = values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>()
            / (values.len() - 1) as f64;
        var.sqrt()
    } else {
        0.0
    };
    Some((mean, std_dev))
}

/// Prints one "PC{i}: ratio (percent%)" line per component and returns the
/// running cumulative ratios for further reporting by the caller.
#[cfg(all(feature = "optimized", feature = "cuda"))]
fn print_variance_ratios(explained_variance: &[f64]) -> Vec<f64> {
    let total_variance = explained_variance.iter().sum::<f64>();
    let mut cumulative = 0.0;
    let mut cumulative_ratios = Vec::with_capacity(explained_variance.len());
    for (i, &variance) in explained_variance.iter().enumerate() {
        let ratio = variance / total_variance;
        cumulative += ratio;
        cumulative_ratios.push(cumulative);
        println!("PC{}: {:.4} ({:.2}%)", i + 1, ratio, ratio * 100.0);
    }
    cumulative_ratios
}
#[cfg(all(feature = "optimized", feature = "cuda"))]
#[allow(clippy::result_large_err)]
/// Demonstrates t-SNE embedding on `df` and prints per-cluster mean
/// coordinates of the resulting 2D embedding.
fn tsne_example(df: &OptimizedDataFrame) -> Result<(), Error> {
    println!("\n==== t-SNE (t-distributed Stochastic Neighbor Embedding) ====");

    // Configure the model: 2 output dimensions, PCA-based initialization,
    // and a short iteration budget so the example stays fast.
    let mut model = TSNE::new();
    model.init = TSNEInit::PCA;
    model.n_components = 2;
    model.max_iter = 100;
    model.perplexity = 30.0;
    model.learning_rate = 200.0;

    println!("\nRunning t-SNE (this may take some time)...");
    let embedded = model.fit_transform(df)?;
    println!(
        "DataFrame after t-SNE: {:?} rows x {:?} columns",
        embedded.row_count(),
        embedded.column_names().len()
    );

    calculate_cluster_means(&embedded, "TSNE1", "TSNE2")?;
    Ok(())
}
#[cfg(all(feature = "optimized", feature = "cuda"))]
#[allow(clippy::result_large_err)]
/// Prints the mean (x, y) coordinates per cluster label, reading labels from
/// the "cluster" column and coordinates from `x_col`/`y_col`.
/// Does nothing when the frame carries no "cluster" column.
fn calculate_cluster_means(df: &OptimizedDataFrame, x_col: &str, y_col: &str) -> Result<(), Error> {
    if !df.column_names().contains(&"cluster".to_string()) {
        return Ok(());
    }
    println!("\nMean Coordinates for Each Cluster:");

    let label_view = df.column("cluster")?;
    let xs_view = df.column(x_col)?;
    let ys_view = df.column(y_col)?;
    // All three columns must have the expected types; otherwise print nothing.
    let (labels, xs, ys) =
        match (label_view.as_string(), xs_view.as_float64(), ys_view.as_float64()) {
            (Some(l), Some(x), Some(y)) => (l, x, y),
            _ => return Ok(()),
        };

    // Record cluster labels in first-seen order so output order is stable.
    let mut order: Vec<String> = Vec::new();
    for row in 0..df.row_count() {
        if let Ok(Some(label)) = labels.get(row) {
            let label = label.to_string();
            if !order.contains(&label) {
                order.push(label);
            }
        }
    }

    // Accumulate (label, sum_x, sum_y, count); rows missing any value are skipped.
    let mut sums: Vec<(String, f64, f64, usize)> = Vec::new();
    for row in 0..df.row_count() {
        if let (Ok(Some(label)), Ok(Some(x)), Ok(Some(y))) =
            (labels.get(row), xs.get(row), ys.get(row))
        {
            let key = label.to_string();
            match sums.iter_mut().find(|entry| entry.0 == key) {
                Some(entry) => {
                    entry.1 += x;
                    entry.2 += y;
                    entry.3 += 1;
                }
                None => sums.push((key, x, y, 1)),
            }
        }
    }

    for label in order {
        if let Some(entry) = sums.iter().find(|e| e.0 == label) {
            let x_mean = entry.1 / entry.3 as f64;
            let y_mean = entry.2 / entry.3 as f64;
            println!(
                "Cluster {}: ({:.4}, {:.4}), {} samples",
                label, x_mean, y_mean, entry.3
            );
        }
    }
    Ok(())
}
#[cfg(all(feature = "optimized", feature = "cuda"))]
#[allow(clippy::result_large_err)]
/// Builds a synthetic DataFrame: 300 samples in 10 dimensions, split evenly
/// across three clusters ("A", "B", "C") centred at 0, +5 and -5 with
/// different uniform noise spreads, plus a string "cluster" label column.
fn create_sample_data() -> Result<OptimizedDataFrame, Error> {
    let n_samples = 300;
    let n_features = 10;
    let labels = ["A", "B", "C"];
    // Every feature of a cluster shares the same center and noise spread.
    let centers = [0.0_f64, 5.0, -5.0];
    let spreads = [1.0_f64, 2.0, 1.5];
    let per_cluster = n_samples / 3;

    let mut rng = rand::thread_rng();
    let mut frame = OptimizedDataFrame::new();

    for feature_idx in 0..n_features {
        let mut values = Vec::with_capacity(n_samples);
        for cluster in 0..labels.len() {
            for _ in 0..per_cluster {
                values.push(centers[cluster] + rng.random_range(-spreads[cluster]..spreads[cluster]));
            }
        }
        let name = format!("feature{}", feature_idx + 1);
        let column = Float64Column::with_name(values, name.clone());
        frame.add_column(name, Column::Float64(column))?;
    }

    // Label rows in the same block order the feature values were generated in.
    let mut cluster_labels = Vec::with_capacity(n_samples);
    for label in &labels {
        for _ in 0..per_cluster {
            cluster_labels.push(label.to_string());
        }
    }
    frame.add_column(
        "cluster",
        Column::String(StringColumn::with_name(cluster_labels, "cluster")),
    )?;
    Ok(frame)
}
#[cfg(not(all(feature = "optimized", feature = "cuda")))]
fn main() {
    // Fallback entry point when the required feature flags are not enabled:
    // explain how to rebuild the example with the right features.
    let messages = [
        "This example requires both \"optimized\" and \"cuda\" feature flags to be enabled.",
        "Please recompile with:",
        " cargo run --example optimized_ml_dimension_reduction_example --features \"optimized cuda\"",
    ];
    for message in messages {
        println!("{}", message);
    }
}