torsh 0.1.3 - Docs.rs

//! Comprehensive Integration Tests for ToRSh Framework
//!
//! This module contains integration tests that validate:
//! - Cross-crate functionality and compatibility
//! - API consistency across modules
//! - End-to-end workflows spanning multiple crates
//! - Feature flag integration
//! - Version compatibility
//! - Memory management and performance
//! - Error handling and edge cases

use std::collections::HashMap;
use torsh::check_version;
use torsh::prelude::*;

/// Test basic tensor operations and prelude imports
#[test]
fn test_prelude_integration() {
    // Test that all basic types are available from prelude
    let _device = DeviceType::Cpu;
    let _dtype = DType::F32;
    let _shape = Shape::new(vec![2, 3]);

    // Test tensor creation functions
    let zeros_tensor = zeros::<f32>(&[2, 3]).unwrap();
    let ones_tensor = ones::<f32>(&[2, 3]).unwrap();
    let rand_tensor = randn::<f32>(&[2, 3]).unwrap();

    assert_eq!(zeros_tensor.shape().dims(), &[2, 3]);
    assert_eq!(ones_tensor.shape().dims(), &[2, 3]);
    assert_eq!(rand_tensor.shape().dims(), &[2, 3]);

    // Test that Result and TorshError are available
    let _result: Result<()> = Ok(());
}

/// Test tensor creation macros and convenience functions
#[test]
fn test_tensor_macros() {
    // Test basic tensor creation
    let data = vec![1.0f32, 2.0, 3.0, 4.0];
    let t1 = Tensor::from_data(data.clone(), vec![4], DeviceType::Cpu).unwrap();
    assert_eq!(t1.shape().dims(), &[4]);

    // Test 2D tensor creation
    let data2d = vec![1.0f32, 2.0, 3.0, 4.0];
    let t2 = Tensor::from_data(data2d, vec![2, 2], DeviceType::Cpu).unwrap();
    assert_eq!(t2.shape().dims(), &[2, 2]);

    // Test device creation
    let cpu_device = DeviceType::Cpu;
    assert_eq!(cpu_device, DeviceType::Cpu);

    // Test shape creation
    let shape_macro = Shape::new(vec![2, 3, 4]);
    assert_eq!(shape_macro.dims(), &[2, 3, 4]);
}

/// Test autograd integration across the framework
#[test]
fn test_autograd_integration() -> Result<()> {
    // Test basic gradient computation
    let data = vec![2.0f32, 3.0];
    let x = Tensor::from_data(data, vec![2], DeviceType::Cpu)
        .unwrap()
        .requires_grad_(true);

    // Simple computation for gradient test
    let y = x.sum().unwrap();

    // Test that basic operations work
    assert_eq!(y.shape().dims(), &[] as &[usize]);

    // Note: actual gradient computation depends on autograd implementation
    println!("Autograd integration test passed");

    // Test no_grad context
    let result = {
        let _guard = no_grad();
        let a = Tensor::from_data(vec![1.0f32, 2.0], vec![2], DeviceType::Cpu)
            .unwrap()
            .requires_grad_(true);
        let b_data = Tensor::from_data(vec![2.0f32, 3.0], vec![2], DeviceType::Cpu).unwrap();
        let b = a.mul(&b_data)?;
        b
    };

    assert!(!result.requires_grad());
    Ok(())
}

/// Test neural network module integration
#[cfg(feature = "nn")]
#[test]
fn test_neural_network_integration() -> Result<()> {
    use torsh::nn::*;

    // Create a simple MLP
    let linear1 = Linear::new(10, 20, true);
    let relu = ReLU::new();
    let linear2 = Linear::new(20, 5, true);

    // Test forward pass
    let input = randn::<f32>(&[4, 10]).unwrap();

    let x = linear1.forward(&input)?;
    assert_eq!(x.shape().dims(), &[4, 20]);

    let x = relu.forward(&x)?;
    assert_eq!(x.shape().dims(), &[4, 20]);

    let output = linear2.forward(&x)?;
    assert_eq!(output.shape().dims(), &[4, 5]);

    // Test Sequential container
    let model = Sequential::new().add(linear1).add(relu).add(linear2);

    let output2 = model.forward(&input)?;
    assert_eq!(output2.shape().dims(), &[4, 5]);

    // Test parameter collection
    let params = model.parameters();
    assert!(!params.is_empty());

    Ok(())
}

/// Test optimization integration
#[cfg(all(feature = "nn", feature = "optim"))]
#[test]
fn test_optimization_integration() -> Result<()> {
    use torsh::nn::*;
    use torsh::optim::*;

    // Create model
    let mut model = Sequential::new()
        .add(Linear::new(5, 10, true))
        .add(ReLU::new())
        .add(Linear::new(10, 1, true));

    // Set training mode to ensure parameters require gradients
    model.train();

    // Create optimizer with model parameters
    let _optimizer = Adam::new(
        model
            .parameters()
            .into_iter()
            .map(|(_name, param)| param.tensor())
            .collect(),
        Some(0.01),
        None,
        None,
        None,
        false,
    );

    // Simple training step - enable gradients on input for autograd
    let input = randn::<f32>(&[8, 5]).unwrap().requires_grad_(true);
    let _target = randn::<f32>(&[8, 1]).unwrap();

    // Forward pass
    let output = model.forward(&input)?;
    // Use simple sum of squares as loss (just for testing)
    let _loss = output.pow(2.0)?.mean(None, false)?;

    // Backward pass - temporarily disabled due to autograd limitations
    // TODO: Re-enable when autograd system supports neural network modules
    // optimizer.zero_grad();
    // loss.backward()?;
    // optimizer.step()?;

    // For now, just verify the forward pass and loss computation work
    println!("Optimization integration test passed (forward pass only)");

    // Check that gradients were computed
    // Note: Parameter gradient checking would be implemented here
    // when the Parameter struct has proper gradient access methods
    for (_name, _param) in model.parameters() {
        // Placeholder for gradient checking
    }

    Ok(())
}

/// Test functional operations integration
#[cfg(feature = "functional")]
#[test]
fn test_functional_integration() -> Result<()> {
    // Import functional operations directly
    use torsh::functional::{cross_entropy, relu, sigmoid, softmax, tanh};

    let input = randn::<f32>(&[2, 3, 4]).unwrap();

    // Test activation functions (note: relu takes an 'inplace' parameter)
    let relu_out = relu(&input, false)?;
    let sigmoid_out = sigmoid(&input)?;
    let tanh_out = tanh(&input)?;

    assert_eq!(relu_out.shape(), input.shape());
    assert_eq!(sigmoid_out.shape(), input.shape());
    assert_eq!(tanh_out.shape(), input.shape());

    // Test normalization
    let softmax_out = softmax(&input, -1, None)?;
    assert_eq!(softmax_out.shape(), input.shape());

    // Test loss functions
    let logits = randn::<f32>(&[4, 10]).unwrap();
    // Note: cross_entropy in torsh-functional expects both inputs to be f32
    let targets = Tensor::from_data(vec![1.0f32, 5.0, 2.0, 8.0], vec![4], DeviceType::Cpu).unwrap();

    let ce_loss = cross_entropy(&logits, &targets, None, "mean", None, 0.0)?;
    assert_eq!(ce_loss.shape().dims(), &[] as &[usize]);

    Ok(())
}

/// Test data loading integration
#[cfg(feature = "data")]
#[test]
fn test_data_loading_integration() -> Result<()> {
    use torsh::data::*;

    // Create synthetic dataset
    let data_size = 100;
    let feature_size = 10;

    // Create batch tensors with proper dimensions
    let data_tensor = randn::<f32>(&[data_size, feature_size]).unwrap();

    let targets_vec: Vec<f32> = (0..data_size).map(|i| (i % 5) as f32).collect();
    let targets_tensor =
        Tensor::from_data(targets_vec, vec![data_size, 1], DeviceType::Cpu).unwrap();

    // Create dataset with properly shaped tensors
    let dataset = TensorDataset::new(vec![data_tensor, targets_tensor]);

    // Create data loader
    let batch_size = 16;
    let dataloader = DataLoader::builder(dataset)
        .batch_size(batch_size)
        .shuffle(true)
        .num_workers(2)
        .build_with_random_sampling()?;

    let mut total_batches = 0;
    let mut total_samples = 0;

    for batch in dataloader.iter() {
        let batch_tensors = batch?;

        // With the new structure, we expect 2 tensors: data and targets
        if batch_tensors.len() == 2 {
            let batch_data = &batch_tensors[0];
            let batch_targets = &batch_tensors[1];

            assert_eq!(batch_data.shape().dims()[1], feature_size);
            let actual_batch_size = batch_data.shape().dims()[0];
            assert!(actual_batch_size <= batch_size);
            assert_eq!(batch_targets.shape().dims()[0], actual_batch_size);

            total_batches += 1;
            total_samples += actual_batch_size;
        }
    }

    assert_eq!(total_samples, data_size);
    assert!(total_batches > 0);

    Ok(())
}

/// Test sparse tensor integration
#[cfg(feature = "sparse")]
#[test]
fn test_sparse_integration() -> Result<()> {
    use torsh::sparse::*;

    // Create sparse COO tensor using the correct API
    let row_indices = vec![0, 1, 1];
    let col_indices = vec![2, 0, 2];
    let values = vec![3.0, 4.0, 5.0];
    let shape = Shape::new(vec![2, 3]);

    let sparse_tensor = CooTensor::new(row_indices, col_indices, values, shape)?;

    // Test sparse operations
    let dense = sparse_tensor.to_dense()?;
    assert_eq!(dense.shape().dims(), &[2, 3]);

    Ok(())
}

/// Test quantization integration
#[cfg(feature = "quantization")]
#[test]
fn test_quantization_integration() -> Result<()> {
    use torsh::quantization::*;

    let tensor = randn::<f32>(&[4, 4]).unwrap();

    // Test quantization
    let quantized = quantize_per_tensor(&tensor, 0.1, 0, DType::I8)?;
    let dequantized = dequantize(&quantized, 0.1, 0)?;

    assert_eq!(dequantized.shape(), tensor.shape());

    Ok(())
}

/// Test special functions integration
#[cfg(feature = "special")]
#[test]
fn test_special_functions_integration() -> Result<()> {
    use torsh::special::*;

    let x = tensor![0.5, 1.0, 2.0]?;

    // Test gamma function
    let gamma_result = gamma(&x)?;
    assert_eq!(gamma_result.shape(), x.shape());

    // Test error function
    let erf_result = erf(&x)?;
    assert_eq!(erf_result.shape(), x.shape());

    Ok(())
}

/// Test linear algebra integration
#[cfg(feature = "linalg")]
#[test]
fn test_linalg_integration() -> Result<()> {
    use torsh::linalg::*;

    let matrix = randn::<f32>(&[4, 4]).unwrap();

    // Test matrix decomposition
    let (q, r) = qr(&matrix)?;
    assert_eq!(q.shape().dims(), &[4, 4]);
    assert_eq!(r.shape().dims(), &[4, 4]);

    // Test determinant - returns f32, not Tensor
    let determinant = det(&matrix)?;
    // Just verify we can compute a determinant (it's a scalar f32)
    let _ = determinant;

    Ok(())
}

/// Test device compatibility and transfers
#[test]
fn test_device_integration() -> Result<()> {
    let cpu_device = DeviceType::Cpu;
    let tensor_cpu = randn::<f32>(&[10, 10]).unwrap().to_device(cpu_device)?;

    assert_eq!(tensor_cpu.device(), cpu_device);

    // Test operations on CPU
    let result_cpu = tensor_cpu.matmul(&tensor_cpu)?;
    assert_eq!(result_cpu.device(), cpu_device);
    assert_eq!(result_cpu.shape().dims(), &[10, 10]);

    Ok(())
}

/// Test memory management across operations
#[test]
fn test_memory_management() -> Result<()> {
    // Create large tensors and perform operations
    let large_tensors: Vec<Tensor> = (0..5).map(|_| randn::<f32>(&[100, 100]).unwrap()).collect();

    // Perform operations that create intermediate tensors
    let mut results = Vec::new();
    for i in 0..4 {
        let result = large_tensors[i].matmul(&large_tensors[i + 1])?;
        results.push(result);
    }

    // Verify results have correct shapes
    for result in &results {
        assert_eq!(result.shape().dims(), &[100, 100]);
    }

    // Drop references - memory should be freed
    drop(large_tensors);
    drop(results);

    Ok(())
}

/// Test error handling across modules
#[test]
fn test_error_handling() {
    // Test shape mismatch errors
    let a = randn::<f32>(&[2, 3]).unwrap();
    let b = randn::<f32>(&[4, 5]).unwrap();

    let result = a.matmul(&b);
    assert!(result.is_err());

    // Test invalid device handling
    // Note: DeviceType is an enum, so invalid device strings are caught at compile time
    let _device = DeviceType::Cpu; // This is always valid

    // Test invalid reshape
    let tensor = randn::<f32>(&[2, 3]).unwrap();
    let result = tensor.reshape(&[5]); // 6 elements cannot be reshaped to 5
    assert!(result.is_err());
}

/// Test version compatibility
#[test]
fn test_version_compatibility() -> Result<()> {
    // Test version checking
    assert!(check_version(0, 1).is_ok());
    assert!(check_version(1, 0).is_err()); // Future version should fail

    // Test crate version compatibility
    // Note: version module is not yet implemented
    // version::check_version_compatibility()?;

    // Test version info
    // Note: version module is not yet implemented
    // let versions = version::get_crate_versions();
    // assert!(!versions.is_empty());

    // for version in &versions {
    //     assert!(!version.name.is_empty());
    //     assert!(!version.version.is_empty());
    // }

    Ok(())
}

/// Test feature detection and requirements
#[test]
fn test_feature_integration() -> Result<()> {
    use torsh::features::*;

    // Get enabled features
    let features = get_enabled_features();
    assert!(!features.is_empty());

    // Check that core features are enabled
    let core_enabled = features.iter().any(|f| f.name == "std" && f.enabled);
    assert!(core_enabled);

    // Test feature requirements
    let core_requirements = ["std"]; // Core requirement
    assert!(check_feature_requirements(&core_requirements).is_ok());

    // Test feature stats
    let stats = get_feature_stats();
    assert!(!stats.is_empty());

    Ok(())
}

/// Test tensor indexing and slicing
#[test]
fn test_tensor_indexing() -> Result<()> {
    let tensor = randn::<f32>(&[4, 5, 6]).unwrap();

    // Test basic indexing
    let slice1 = tensor.slice(0, 0, 2)?; // First 2 elements along dim 0
    assert_eq!(slice1.shape().dims(), &[2, 5, 6]);

    let slice2 = tensor.slice(1, 1, 4)?; // Elements 1-3 along dim 1
    assert_eq!(slice2.shape().dims(), &[4, 3, 6]);

    // Test squeeze and unsqueeze
    let squeezed = tensor.unsqueeze(0)?.squeeze(0)?;
    assert_eq!(squeezed.shape().dims(), &[4, 5, 6]);

    Ok(())
}

/// Test advanced tensor operations
#[test]
fn test_advanced_tensor_ops() -> Result<()> {
    let a = randn::<f32>(&[3, 4]).unwrap();
    let b = randn::<f32>(&[4, 5]).unwrap();

    // Test matrix multiplication
    let c = a.matmul(&b)?;
    assert_eq!(c.shape().dims(), &[3, 5]);

    // Test broadcasting
    let d = randn::<f32>(&[3, 1]).unwrap();
    let e = a.add(&d)?; // Broadcasting
    assert_eq!(e.shape().dims(), &[3, 4]);

    // Test reduction operations
    let sum_all = a.sum()?;
    assert_eq!(sum_all.shape().dims(), &[] as &[usize]);

    let sum_dim = a.sum_dim(&[0], false)?;
    assert_eq!(sum_dim.shape().dims(), &[4]);

    let mean_val = a.mean(None, false)?;
    assert_eq!(mean_val.shape().dims(), &[] as &[usize]);

    Ok(())
}

/// Test end-to-end training workflow
#[cfg(all(feature = "nn", feature = "optim", feature = "data"))]
#[test]
fn test_end_to_end_workflow() -> Result<()> {
    use torsh::data::*;
    use torsh::nn::*;
    use torsh::optim::*;

    // Create model
    let mut model = Sequential::new()
        .add(Linear::new(10, 20, true))
        .add(ReLU::new())
        .add(Linear::new(20, 5, true));

    // Set training mode to ensure parameters require gradients
    model.train();

    // Create optimizer with model parameters
    let _optimizer = Adam::new(
        model
            .parameters()
            .into_iter()
            .map(|(_name, param)| param.tensor())
            .collect(),
        Some(0.001),
        None,
        None,
        None,
        false,
    );

    // Create synthetic dataset with proper batch dimensions
    let data_tensor = randn::<f32>(&[64, 10]).unwrap();
    let targets_vec: Vec<f32> = (0..64).map(|i| (i % 5) as f32).collect();
    let targets_tensor = Tensor::from_data(targets_vec, vec![64, 1], DeviceType::Cpu).unwrap();

    let dataset = TensorDataset::new(vec![data_tensor, targets_tensor]);
    let dataloader = DataLoader::builder(dataset).batch_size(16).build()?;

    // Training loop
    for batch in dataloader.iter() {
        let batch_tensors = batch?;

        // Get data and targets from the batch
        let batch_data = &batch_tensors[0]; // Data tensor [batch_size, 10]
        let _batch_targets = &batch_tensors[1]; // Targets tensor [batch_size, 1]

        // Enable gradients on batch data for autograd
        let batch_data = batch_data.clone().requires_grad_(true);

        // Forward pass
        let outputs = model.forward(&batch_data)?;
        // Use simple sum of squares as loss (just for testing)
        let loss = outputs.pow(2.0)?.mean(None, false)?;

        // Backward pass - temporarily disabled due to autograd limitations
        // TODO: Re-enable when autograd system supports neural network modules
        // optimizer.zero_grad();
        // loss.backward()?;
        // optimizer.step()?;

        // For now, just verify the forward pass and loss computation work
        println!("End-to-end workflow test passed (forward pass only)");

        // Verify loss is computed
        assert!(loss.item()? > 0.0);
        break; // Just test one batch
    }

    Ok(())
}

/// Test serialization and state management
#[cfg(feature = "nn")]
#[test]
fn test_serialization_integration() -> Result<()> {
    use torsh::nn::*;

    // Create model
    let model = Sequential::new()
        .add(Linear::new(5, 10, true))
        .add(ReLU::new())
        .add(Linear::new(10, 3, true));

    // Get state dict
    let state_dict = model.state_dict();
    assert!(!state_dict.is_empty());

    // Test parameter access
    let parameters = model.parameters();
    assert!(!parameters.is_empty());

    for param in &parameters {
        assert!(param.1.tensor().read().shape().numel() > 0);
    }

    Ok(())
}

/// Test performance characteristics
#[test]
fn test_performance_characteristics() -> Result<()> {
    let sizes = vec![10, 50, 100];
    let mut timings = HashMap::new();

    for size in sizes {
        let a = randn::<f32>(&[size, size]).unwrap();
        let b = randn::<f32>(&[size, size]).unwrap();

        // Time matrix multiplication
        let start = std::time::Instant::now();
        let _result = a.matmul(&b)?;
        let duration = start.elapsed();

        timings.insert(size, duration);
    }

    // Verify that operations complete in reasonable time
    assert!(timings[&100] < std::time::Duration::from_secs(1));

    Ok(())
}

/// Test tensor broadcasting rules
#[test]
fn test_broadcasting() -> Result<()> {
    // Test basic tensor operations that should work

    // Scalar operations
    let a = randn::<f32>(&[3, 4]).unwrap();
    let result = a.mul_scalar(2.0)?;
    assert_eq!(result.shape().dims(), &[3, 4]);

    // Same-shape operations
    let b = randn::<f32>(&[3, 4]).unwrap();
    let result2 = a.add(&b)?;
    assert_eq!(result2.shape().dims(), &[3, 4]);

    // Basic reduction operation
    let result3 = a.sum()?;
    assert_eq!(result3.shape().dims(), &[] as &[usize]);

    Ok(())
}

/// Test gradient flow in complex computational graphs
#[test]
fn test_complex_autograd() -> Result<()> {
    let x = tensor![1.0, 2.0]?.requires_grad_(true);
    let y = tensor![3.0, 4.0]?.requires_grad_(true);

    // Complex computational graph
    let z1 = x.pow(2.0)?;
    let z2 = y.mul_scalar(2.0)?;
    let z3 = z1.add(&z2)?;
    let loss = z3.sum()?;

    loss.backward()?;

    // Just verify that backward completes without error
    // Gradient checking would require proper autograd implementation
    assert!(loss.item()? > 0.0);

    Ok(())
}

/// Main integration test runner
#[test]
fn run_comprehensive_integration_tests() {
    println!("=== Running ToRSh Comprehensive Integration Tests ===");

    // Core functionality tests
    test_prelude_integration();
    test_tensor_macros();
    test_autograd_integration().expect("Autograd integration failed");
    test_device_integration().expect("Device integration failed");
    test_memory_management().expect("Memory management failed");
    test_error_handling();
    test_version_compatibility().expect("Version compatibility failed");
    test_feature_integration().expect("Feature integration failed");
    test_tensor_indexing().expect("Tensor indexing failed");
    test_advanced_tensor_ops().expect("Advanced tensor ops failed");
    test_broadcasting().expect("Broadcasting failed");
    test_complex_autograd().expect("Complex autograd failed");
    test_performance_characteristics().expect("Performance test failed");

    // Feature-dependent tests
    #[cfg(feature = "nn")]
    {
        test_neural_network_integration().expect("Neural network integration failed");
        test_serialization_integration().expect("Serialization integration failed");
    }

    #[cfg(all(feature = "nn", feature = "optim"))]
    test_optimization_integration().expect("Optimization integration failed");

    #[cfg(feature = "functional")]
    test_functional_integration().expect("Functional integration failed");

    #[cfg(feature = "data")]
    test_data_loading_integration().expect("Data loading integration failed");

    #[cfg(feature = "sparse")]
    test_sparse_integration().expect("Sparse integration failed");

    #[cfg(feature = "quantization")]
    test_quantization_integration().expect("Quantization integration failed");

    #[cfg(feature = "special")]
    test_special_functions_integration().expect("Special functions integration failed");

    #[cfg(feature = "linalg")]
    test_linalg_integration().expect("Linear algebra integration failed");

    #[cfg(all(feature = "nn", feature = "optim", feature = "data"))]
    test_end_to_end_workflow().expect("End-to-end workflow failed");

    println!("\n=== All Comprehensive Integration Tests Passed! ===");
}

// Helper functions for testing

/// Helper function to verify tensor values are close
#[allow(dead_code)]
fn assert_tensors_close(a: &Tensor, b: &Tensor, tolerance: f64) -> Result<()> {
    assert_eq!(a.shape(), b.shape());

    let diff = a.sub(b)?;
    let max_diff = diff.abs()?.max(None, false)?.item()? as f64;

    if max_diff > tolerance {
        return Err(TorshError::Other(format!(
            "Tensors not close: max difference {} > tolerance {}",
            max_diff, tolerance
        )));
    }

    Ok(())
}

/// Helper function to create test data
#[allow(dead_code)]
fn create_test_dataset(
    size: usize,
    input_dim: usize,
    num_classes: usize,
) -> (Vec<Tensor>, Vec<Tensor>) {
    let data: Vec<Tensor> = (0..size)
        .map(|_| randn::<f32>(&[input_dim]).unwrap())
        .collect();
    let targets: Vec<Tensor> = (0..size)
        .map(|_| {
            let target = (size % num_classes) as f32;
            Tensor::from_data(vec![target], vec![1], DeviceType::Cpu).unwrap()
        })
        .collect();

    (data, targets)
}

/// Helper to measure execution time
fn time_execution<F, R>(f: F) -> (R, std::time::Duration)
where
    F: FnOnce() -> R,
{
    let start = std::time::Instant::now();
    let result = f();
    let duration = start.elapsed();
    (result, duration)
}

// Benchmark tests for comprehensive performance validation
#[cfg(test)]
mod benchmark_tests {
    use super::*;

    #[test]
    fn benchmark_basic_operations() {
        let sizes = vec![32, 64, 128, 256];

        for size in sizes {
            let a = randn::<f32>(&[size, size]).unwrap();
            let b = randn::<f32>(&[size, size]).unwrap();

            // Matrix multiplication benchmark
            let (_, duration) = time_execution(|| {
                let _result = a.matmul(&b);
                _result
            });
            println!("Matrix multiplication {}x{}: {:?}", size, size, duration);

            // Element-wise operations benchmark
            let (_, duration) = time_execution(|| {
                let _result = a.add(&b);
                _result
            });
            println!("Element-wise addition {}x{}: {:?}", size, size, duration);
        }
    }

    #[cfg(feature = "nn")]
    #[test]
    fn benchmark_neural_network() {
        use torsh::nn::*;

        let model = Sequential::new()
            .add(Linear::new(784, 512, true))
            .add(ReLU::new())
            .add(Linear::new(512, 256, true))
            .add(ReLU::new())
            .add(Linear::new(256, 10, true));

        let input = randn::<f32>(&[64, 784]).unwrap();

        let (_, duration) = time_execution(|| model.forward(&input).unwrap());
        println!("Neural network forward pass (batch=64): {:?}", duration);
    }
}