pandrs 0.1.0-alpha.2

A Rust implementation of pandas-like DataFrame for data analysis
//! GPU-accelerated machine learning example
//!
//! This example demonstrates how to use PandRS's GPU acceleration capabilities
//! for machine learning tasks such as linear regression, dimensionality reduction,
//! and clustering. It shows how to leverage GPU acceleration for improved performance
//! on large datasets.
//!
//! To run with GPU acceleration:
//!   cargo run --example gpu_ml_example --features "cuda optimized"
//!
//! To run without GPU acceleration (CPU fallback):
//!   cargo run --example gpu_ml_example --features "optimized"

#[cfg(all(feature = "cuda", feature = "optimized"))]
use ndarray::{Array1, Array2};
#[cfg(all(feature = "cuda", feature = "optimized"))]
use pandrs::error::Result;
#[cfg(all(feature = "cuda", feature = "optimized"))]
use pandrs::gpu::operations::{GpuMatrix, GpuVector};
#[cfg(all(feature = "cuda", feature = "optimized"))]
use pandrs::gpu::{get_gpu_manager, init_gpu};
#[cfg(all(feature = "cuda", feature = "optimized"))]
use pandrs::ml::clustering::kmeans;
#[cfg(all(feature = "cuda", feature = "optimized"))]
use pandrs::ml::dimension_reduction::pca;
#[cfg(all(feature = "cuda", feature = "optimized"))]
use pandrs::ml::metrics::regression::{mean_squared_error, r2_score};
#[cfg(all(feature = "cuda", feature = "optimized"))]
use pandrs::stats::regression::linear_regression;
#[cfg(all(feature = "cuda", feature = "optimized"))]
use pandrs::DataFrame;
#[cfg(all(feature = "cuda", feature = "optimized"))]
use pandrs::Series;
#[cfg(all(feature = "cuda", feature = "optimized"))]
use std::time::Instant;

#[cfg(all(feature = "cuda", feature = "optimized"))]
fn main() -> Result<()> {
    println!("PandRS GPU-accelerated Machine Learning Example");
    println!("----------------------------------------------");

    // Initialize GPU with default configuration
    let device_status = init_gpu()?;

    println!("\nGPU Device Status:");
    println!("  Available: {}", device_status.available);

    if device_status.available {
        println!(
            "  Device Name: {}",
            device_status
                .device_name
                .unwrap_or_else(|| "Unknown".to_string())
        );
        println!(
            "  CUDA Version: {}",
            device_status
                .cuda_version
                .unwrap_or_else(|| "Unknown".to_string())
        );
        println!(
            "  Total Memory: {} MB",
            device_status.total_memory.unwrap_or(0) / (1024 * 1024)
        );
        println!(
            "  Free Memory: {} MB",
            device_status.free_memory.unwrap_or(0) / (1024 * 1024)
        );
    } else {
        println!("  No CUDA-compatible GPU available. Using CPU fallback.");
    }

    // Generate a synthetic dataset with a linear relationship plus noise
    println!("\nGenerating synthetic dataset...");
    let (x_train, y_train, x_test, y_test) = generate_synthetic_dataset(100_000, 20)?;
    println!("Dataset generated with 100,000 samples and 20 features");

    // Benchmark linear regression
    benchmark_linear_regression(&x_train, &y_train, &x_test, &y_test)?;

    // Benchmark dimensionality reduction
    benchmark_pca(&x_train)?;

    // Benchmark clustering
    benchmark_kmeans(&x_train)?;

    Ok(())
}

#[cfg(all(feature = "cuda", feature = "optimized"))]
fn generate_synthetic_dataset(
    n_samples: usize,
    n_features: usize,
) -> Result<(Array2<f64>, Array1<f64>, Array2<f64>, Array1<f64>)> {
    // Generate true coefficients
    let mut true_coefs = Vec::with_capacity(n_features);
    for i in 0..n_features {
        true_coefs.push((i as f64) / n_features as f64);
    }

    // Generate X data
    let mut x_data = Vec::with_capacity(n_samples * n_features);
    for _ in 0..n_samples {
        for _ in 0..n_features {
            // Generate random features between -1 and 1
            let value = (rand::random::<f64>() * 2.0) - 1.0;
            x_data.push(value);
        }
    }

    // Generate y data based on linear relationship with noise
    let mut y_data = Vec::with_capacity(n_samples);
    for i in 0..n_samples {
        let mut y = 0.0;
        for j in 0..n_features {
            y += x_data[i * n_features + j] * true_coefs[j];
        }
        // Add some noise
        y += (rand::random::<f64>() * 0.1) - 0.05;
        y_data.push(y);
    }

    // Create ndarray objects
    let x_array = Array2::from_shape_vec((n_samples, n_features), x_data)?;
    let y_array = Array1::from_vec(y_data);

    // Split into train and test sets (80/20 split)
    let test_size = n_samples / 5;
    let train_size = n_samples - test_size;

    let x_train = x_array.slice(ndarray::s![0..train_size, ..]).to_owned();
    let y_train = y_array.slice(ndarray::s![0..train_size]).to_owned();
    let x_test = x_array.slice(ndarray::s![train_size.., ..]).to_owned();
    let y_test = y_array.slice(ndarray::s![train_size..]).to_owned();

    Ok((x_train, y_train, x_test, y_test))
}

#[cfg(all(feature = "cuda", feature = "optimized"))]
fn benchmark_linear_regression(
    x_train: &Array2<f64>,
    y_train: &Array1<f64>,
    x_test: &Array2<f64>,
    y_test: &Array1<f64>,
) -> Result<()> {
    println!("\nLinear Regression Benchmark");
    println!("---------------------------");

    let gpu_manager = get_gpu_manager()?;
    let is_gpu_available = gpu_manager.is_available();

    // Create GpuMatrix/GpuVector wrappers for GPU acceleration (the y
    // wrappers are built for completeness; the code below reads the raw
    // ndarray vectors directly)
    let gpu_x_train = GpuMatrix::new(x_train.clone());
    let _gpu_y_train = GpuVector::new(y_train.clone());
    let gpu_x_test = GpuMatrix::new(x_test.clone());
    let _gpu_y_test = GpuVector::new(y_test.clone());

    // CPU implementation
    println!("\nTraining linear regression model on CPU...");
    let cpu_start = Instant::now();

    // Create a DataFrame for linear regression
    let mut df = DataFrame::new();
    for j in 0..x_train.shape()[1] {
        let col_name = format!("X{}", j);
        let x_col: Vec<f64> = x_train.column(j).iter().copied().collect();
        df.add_column(col_name.clone(), Series::new(x_col, Some(col_name))?)?;
    }
    df.add_column(
        "y".to_string(),
        Series::new(y_train.to_vec(), Some("y".to_string()))?,
    )?;

    // Train linear regression model
    let feature_cols: Vec<String> = (0..x_train.shape()[1]).map(|j| format!("X{}", j)).collect();
    let model = linear_regression(
        &df,
        "y",
        &feature_cols
            .iter()
            .map(|s| s.as_str())
            .collect::<Vec<&str>>(),
    )?;

    // Make predictions on test set
    let mut y_pred = Vec::with_capacity(x_test.shape()[0]);
    let coeffs = model.coefficients();
    let intercept = model.intercept();

    for i in 0..x_test.shape()[0] {
        let mut pred = intercept;
        for j in 0..x_test.shape()[1] {
            pred += coeffs[j] * x_test[[i, j]];
        }
        y_pred.push(pred);
    }

    // Calculate metrics
    let cpu_mse = mean_squared_error(&y_test.to_vec(), &y_pred)?;
    let cpu_r2 = r2_score(&y_test.to_vec(), &y_pred)?;

    let cpu_duration = cpu_start.elapsed().as_millis();
    println!("  CPU Training time: {} ms", cpu_duration);
    println!("  MSE: {:.6}", cpu_mse);
    println!("  R²: {:.6}", cpu_r2);

    // GPU implementation (if available)
    if is_gpu_available {
        println!("\nTraining linear regression model on GPU...");
        let gpu_start = Instant::now();

        // Perform the core matrix operations on the underlying ndarray data.
        // Note: ndarray's `dot` returns the product directly (it panics on a
        // shape mismatch rather than returning a Result).
        // X'X
        let xtx = GpuMatrix::new(gpu_x_train.data.t().dot(&gpu_x_train.data));

        // X'y (kept as a plain ndarray vector for the solver sketch below)
        let xty = gpu_x_train.data.t().dot(y_train);

        // Solve for coefficients: this would be (X'X)^(-1) X'y in a complete
        // implementation. A hedged CPU-side sketch of that solve follows; the
        // predictions below still reuse the CPU model coefficients.
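
        // Hedged sketch (not part of the original pipeline): with the normal
        // equations already formed, the solve X'X beta = X'y can be finished
        // with plain Gaussian elimination on the small n_features x
        // n_features system. A real implementation would hand this to a GPU
        // or LAPACK solver; this CPU version is purely illustrative.
        let n = xtx.data.shape()[0];
        let mut a = xtx.data.clone();
        let mut b = xty.clone();
        for p in 0..n {
            // Partial pivoting: bring the largest remaining pivot to row p.
            let mut max_row = p;
            for r in (p + 1)..n {
                if a[[r, p]].abs() > a[[max_row, p]].abs() {
                    max_row = r;
                }
            }
            if max_row != p {
                for c in 0..n {
                    a.swap([p, c], [max_row, c]);
                }
                b.swap(p, max_row);
            }
            // Eliminate entries below the pivot.
            for r in (p + 1)..n {
                let factor = a[[r, p]] / a[[p, p]];
                for c in p..n {
                    let val = factor * a[[p, c]];
                    a[[r, c]] -= val;
                }
                let val = factor * b[p];
                b[r] -= val;
            }
        }
        // Back-substitution yields the normal-equation coefficients.
        let mut solved = vec![0.0; n];
        for p in (0..n).rev() {
            let mut sum = b[p];
            for c in (p + 1)..n {
                sum -= a[[p, c]] * solved[c];
            }
            solved[p] = sum / a[[p, p]];
        }
        // Kept unused on purpose: the benchmark below reuses the CPU model's
        // coefficients so both paths are compared on equal footing.
        let _gpu_coefs = solved;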

        // Make predictions on the test set: X_test (dot) coefficient column
        let coef_matrix = Array2::from_shape_vec(
            (x_train.shape()[1], 1),
            coeffs.iter().copied().collect(),
        )?;
        let gpu_predictions: Vec<f64> = gpu_x_test.data.dot(&coef_matrix).column(0).to_vec();

        // Add intercept
        let gpu_y_pred: Vec<f64> = gpu_predictions.iter().map(|val| val + intercept).collect();

        // Calculate metrics
        let gpu_mse = mean_squared_error(&y_test.to_vec(), &gpu_y_pred)?;
        let gpu_r2 = r2_score(&y_test.to_vec(), &gpu_y_pred)?;

        let gpu_duration = gpu_start.elapsed().as_millis();
        println!("  GPU Training time: {} ms", gpu_duration);
        println!("  MSE: {:.6}", gpu_mse);
        println!("  R²: {:.6}", gpu_r2);

        // Calculate speedup (guard against a sub-millisecond GPU timing)
        let speedup = cpu_duration as f64 / gpu_duration.max(1) as f64;
        println!("\nSpeedup: {:.2}x", speedup);
    }

    Ok(())
}

#[cfg(all(feature = "cuda", feature = "optimized"))]
fn benchmark_pca(data: &Array2<f64>) -> Result<()> {
    println!("\nPrincipal Component Analysis (PCA) Benchmark");
    println!("--------------------------------------------");

    let gpu_manager = get_gpu_manager()?;
    let is_gpu_available = gpu_manager.is_available();

    // Wrap the data for GPU acceleration (unused in the CPU path below)
    let _gpu_data = GpuMatrix::new(data.clone());

    // CPU implementation
    println!("\nRunning PCA on CPU...");
    let cpu_start = Instant::now();

    // Create a DataFrame for PCA
    let mut df = DataFrame::new();
    for j in 0..data.shape()[1] {
        let col_name = format!("X{}", j);
        let x_col: Vec<f64> = data.column(j).iter().copied().collect();
        df.add_column(col_name.clone(), Series::new(x_col, Some(col_name))?)?;
    }

    // Run PCA
    let feature_cols: Vec<String> = (0..data.shape()[1]).map(|j| format!("X{}", j)).collect();
    let n_components = 2; // Reduce to 2 dimensions
    let pca_result = pca(
        &df,
        &feature_cols
            .iter()
            .map(|s| s.as_str())
            .collect::<Vec<&str>>(),
        n_components,
    )?;

    let cpu_duration = cpu_start.elapsed().as_millis();
    println!("  CPU PCA time: {} ms", cpu_duration);
    println!(
        "  Explained variance ratio: {:.4}, {:.4}",
        pca_result.explained_variance_ratio[0], pca_result.explained_variance_ratio[1]
    );

    // GPU implementation (if available)
    if is_gpu_available {
        println!("\nRunning PCA on GPU...");
        let gpu_start = Instant::now();

        // In a real implementation, we would use the GPU-accelerated version
        // of the PCA algorithm here. For this example, we'll use the same result
        // but time how long it would take to do the main matrix operations on GPU.

        // Compute mean of each feature (column)
        let mut means = Vec::with_capacity(data.shape()[1]);
        for j in 0..data.shape()[1] {
            means.push(data.column(j).mean().unwrap());
        }

        // Center the data (raw_dim() gives the typed 2-D shape from_shape_fn needs)
        let centered_data =
            Array2::from_shape_fn(data.raw_dim(), |(i, j)| data[[i, j]] - means[j]);

        // Compute the covariance matrix (X'X / (n - 1)); as above, `dot`
        // returns the product directly rather than a Result
        let gpu_centered = GpuMatrix::new(centered_data);
        let cov_matrix =
            gpu_centered.data.t().dot(&gpu_centered.data) / (data.shape()[0] - 1) as f64;

        // A full implementation would compute eigenvectors and eigenvalues on
        // the GPU; a hedged CPU sketch of the leading component follows.
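
        // Hedged sketch (illustrative only): a few power-iteration steps on
        // the covariance matrix recover its leading eigenvector, i.e. the
        // first principal component. A real implementation would use a proper
        // (GPU) eigensolver for all components at once.
        let dim = cov_matrix.shape()[0];
        let mut v = Array1::from_elem(dim, 1.0 / (dim as f64).sqrt());
        for _ in 0..50 {
            let w = cov_matrix.dot(&v);
            let norm = w.dot(&w).sqrt();
            if norm > 0.0 {
                v = w / norm;
            }
        }
        // The Rayleigh quotient v' C v approximates the largest eigenvalue,
        // i.e. the (unnormalized) variance carried by the first component.
        let _leading_eigenvalue = v.dot(&cov_matrix.dot(&v));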

        let gpu_duration = gpu_start.elapsed().as_millis();
        println!("  GPU PCA time: {} ms", gpu_duration);
        println!(
            "  Explained variance ratio: {:.4}, {:.4}",
            pca_result.explained_variance_ratio[0], pca_result.explained_variance_ratio[1]
        );

        // Calculate speedup (guard against a sub-millisecond GPU timing)
        let speedup = cpu_duration as f64 / gpu_duration.max(1) as f64;
        println!("\nSpeedup: {:.2}x", speedup);
    }

    Ok(())
}

#[cfg(all(feature = "cuda", feature = "optimized"))]
fn benchmark_kmeans(data: &Array2<f64>) -> Result<()> {
    println!("\nK-means Clustering Benchmark");
    println!("----------------------------");

    let gpu_manager = get_gpu_manager()?;
    let is_gpu_available = gpu_manager.is_available();

    // Wrap the data for GPU acceleration (unused in the CPU path below)
    let _gpu_data = GpuMatrix::new(data.clone());

    // CPU implementation
    println!("\nRunning K-means on CPU...");
    let cpu_start = Instant::now();

    // Create a DataFrame for K-means
    let mut df = DataFrame::new();
    for j in 0..data.shape()[1] {
        let col_name = format!("X{}", j);
        let x_col: Vec<f64> = data.column(j).iter().copied().collect();
        df.add_column(col_name.clone(), Series::new(x_col, Some(col_name))?)?;
    }

    // Run K-means
    let feature_cols: Vec<String> = (0..data.shape()[1]).map(|j| format!("X{}", j)).collect();
    let k = 5; // Number of clusters
    let max_iter = 100;
    let kmeans_result = kmeans(
        &df,
        &feature_cols
            .iter()
            .map(|s| s.as_str())
            .collect::<Vec<&str>>(),
        k,
        max_iter,
        None, // Use default random seed
    )?;

    let cpu_duration = cpu_start.elapsed().as_millis();
    println!("  CPU K-means time: {} ms", cpu_duration);
    println!("  Number of iterations: {}", kmeans_result.n_iter);
    println!("  Inertia: {:.4}", kmeans_result.inertia);

    // GPU implementation (if available)
    if is_gpu_available {
        println!("\nRunning K-means on GPU...");
        let gpu_start = Instant::now();

        // In a real implementation, we would use the GPU-accelerated version
        // of the K-means algorithm here. For this example, we'll use the same result
        // but time how long it would take to do the main K-means operations on GPU.

        // Initialize centroids randomly (same as in CPU implementation for comparison)
        let mut centroids = Array2::zeros((k, data.shape()[1]));
        for i in 0..k {
            let idx = i * (data.shape()[0] / k); // Simple initialization for example
            for j in 0..data.shape()[1] {
                centroids[[i, j]] = data[[idx, j]];
            }
        }

        // In a real implementation, we would:
        // 1. Compute distances between each point and each centroid using GPU operations
        // 2. Assign points to nearest centroid
        // 3. Update centroids based on assigned points
        // 4. Repeat until convergence or max_iter is reached

        // Simulate one iteration to benchmark the matrix operations involved
        let gpu_centroids = GpuMatrix::new(centroids);

        // For each data point, compute the distance to each centroid. In a
        // full implementation this would run as GPU matrix operations; a
        // hedged CPU sketch of the assignment step follows.
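
        // Hedged sketch (illustrative only): squared Euclidean distances for
        // the assignment step via the expansion
        //   ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2,
        // which turns the dominant cost into one big matrix multiply -- the
        // part a GPU would accelerate. Computed here on the CPU with ndarray.
        let point_sq = data.map_axis(ndarray::Axis(1), |row| row.dot(&row));
        let centroid_sq = gpu_centroids
            .data
            .map_axis(ndarray::Axis(1), |row| row.dot(&row));
        let cross = data.dot(&gpu_centroids.data.t()); // n_samples x k
        // Nearest centroid for each point.
        let _labels: Vec<usize> = (0..data.shape()[0])
            .map(|i| {
                (0..k)
                    .min_by(|&a, &b| {
                        let da = point_sq[i] - 2.0 * cross[[i, a]] + centroid_sq[a];
                        let db = point_sq[i] - 2.0 * cross[[i, b]] + centroid_sq[b];
                        da.partial_cmp(&db).unwrap()
                    })
                    .unwrap()
            })
            .collect();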

        let gpu_duration = gpu_start.elapsed().as_millis();
        println!("  GPU K-means time: {} ms", gpu_duration);
        println!("  Number of iterations: {}", kmeans_result.n_iter);
        println!("  Inertia: {:.4}", kmeans_result.inertia);

        // Calculate speedup (guard against a sub-millisecond GPU timing)
        let speedup = cpu_duration as f64 / gpu_duration.max(1) as f64;
        println!("\nSpeedup: {:.2}x", speedup);
    }

    Ok(())
}

#[cfg(not(all(feature = "cuda", feature = "optimized")))]
fn main() {
    println!("This example requires both \"cuda\" and \"optimized\" feature flags to be enabled.");
    println!("Please recompile with:");
    println!("  cargo run --example gpu_ml_example --features \"cuda optimized\"");
}