scirs2-neural 0.2.0

Neural network building blocks module for SciRS2 (scirs2-neural) - Minimal Version
//! Neural network layers implementation
//!
//! This module provides implementations of various neural network layers
//! such as dense (fully connected), attention, convolution, pooling, etc.
//! Layers are the fundamental building blocks of neural networks.
//! # Overview
//! Neural network layers transform input data through learned parameters (weights and biases).
//! Each layer implements the `Layer` trait, which defines the interface for forward and
//! backward propagation, parameter management, and training/evaluation modes.
//! # Available Layer Types
//! ## Core Layers
//! - **Dense**: Fully connected linear transformation
//! - **Conv2D**: 2D convolutional layers for image processing
//! - **Embedding**: Lookup tables for discrete inputs (words, tokens)
//! ## Activation & Regularization
//! - **Dropout**: Randomly sets inputs to zero during training
//! - **BatchNorm/LayerNorm**: Normalization for stable training
//! - **ActivityRegularization**: L1/L2 penalties on activations
//! ## Pooling & Reshaping
//! - **MaxPool2D/AdaptiveMaxPool2D**: Spatial downsampling
//! - **GlobalAvgPool2D**: Global spatial average pooling
//! ## Attention & Sequence
//! - **MultiHeadAttention**: Transformer-style attention mechanism
//! - **LSTM/GRU**: Recurrent layers for sequences
//! - **Bidirectional**: Wrapper for bidirectional RNNs
//! ## Embedding & Positional
//! - **PositionalEmbedding**: Learned positional encodings
//! - **PatchEmbedding**: Convert image patches to embeddings
//! # Examples
//! ## Creating a Simple Dense Layer
//! ```rust
//! use scirs2_neural::layers::{Layer, Dense};
//! use ndarray::Array;
//! use rand::rngs::SmallRng;
//! use rand::SeedableRng;
//! # fn example() -> scirs2_neural::error::Result<()> {
//! let mut rng = rand::rng();
//! // Create a dense layer: 784 inputs -> 128 outputs with ReLU activation
//! let dense = Dense::<f64>::new(784, 128, Some("relu"), &mut rng)?;
//! // Create input batch (batch_size=2, features=784)
//! let input = Array::zeros((2, 784)).into_dyn();
//! // Forward pass
//! let output = dense.forward(&input)?;
//! assert_eq!(output.shape(), &[2, 128]);
//! println!("Layer type: {}", dense.layer_type());
//! println!("Parameters: {}", dense.parameter_count());
//! # Ok(())
//! # }
//! ```
//! ## Building a Sequential Model
//! ```rust
//! use scirs2_neural::layers::{Layer, Dense, Dropout};
//! use scirs2_neural::models::{Sequential, Model};
//! use ndarray::Array;
//! # fn example() -> scirs2_neural::error::Result<()> {
//! let mut rng = rand::rng();
//! let mut model: Sequential<f32> = Sequential::new();
//! // Build a multi-layer network
//! model.add_layer(Dense::<f32>::new(784, 512, Some("relu"), &mut rng)?);
//! model.add_layer(Dropout::<f32>::new(0.2, &mut rng)?);
//! model.add_layer(Dense::<f32>::new(512, 256, Some("relu"), &mut rng)?);
//! model.add_layer(Dense::<f32>::new(256, 10, Some("softmax"), &mut rng)?);
//! // Input: batch of MNIST-like images (batch_size=32, flattened=784)
//! let input = Array::zeros((32, 784)).into_dyn();
//! // Forward pass through entire model
//! let output = model.forward(&input)?;
//! assert_eq!(output.shape(), &[32, 10]); // 10-class predictions
//! println!("Model has {} layers", model.num_layers());
//! let total_params: usize = model.layers().iter().map(|l| l.parameter_count()).sum();
//! println!("Total parameters: {}", total_params);
//! # Ok(())
//! # }
//! ```
//! ## Using Convolutional Layers
//! ```rust
//! use scirs2_neural::layers::{Layer, Conv2D, MaxPool2D, PaddingMode};
//! use ndarray::Array;
//! # fn example() -> scirs2_neural::error::Result<()> {
//! let mut rng = rand::rng();
//! // Create conv layer: 3 input channels -> 32 output channels, 3x3 kernel
//! let conv = Conv2D::<f64>::new(3, 32, (3, 3), (1, 1), PaddingMode::Same, &mut rng)?;
//! let pool = MaxPool2D::<f64>::new((2, 2), (2, 2), None)?; // 2x2 max pooling
//! // Input: batch of RGB images (batch=4, channels=3, height=32, width=32)
//! let input = Array::zeros((4, 3, 32, 32)).into_dyn();
//! // Apply convolution then pooling
//! let conv_out = conv.forward(&input)?;
//! assert_eq!(conv_out.shape(), &[4, 32, 32, 32]); // Same padding preserves spatial size
//! let pool_out = pool.forward(&conv_out)?;
//! assert_eq!(pool_out.shape(), &[4, 32, 16, 16]); // Pooling halves spatial dims
//! # Ok(())
//! # }
//! ```
//! ## Training vs Evaluation Mode
//! ```rust
//! use scirs2_neural::layers::{Layer, Dropout, BatchNorm};
//! use ndarray::Array;
//! # fn example() -> scirs2_neural::error::Result<()> {
//! let mut rng = rand::rng();
//! let mut dropout = Dropout::<f64>::new(0.5, &mut rng)?;
//! let mut batchnorm = BatchNorm::<f64>::new(128, 0.9, 1e-5, &mut rng)?;
//! let input = Array::ones((10, 128)).into_dyn();
//! // Training mode (default): some outputs will be zeroed by dropout
//! assert!(dropout.is_training());
//! let train_output = dropout.forward(&input)?;
//! // Switch to evaluation mode for inference
//! dropout.set_training(false);
//! batchnorm.set_training(false);
//! let eval_output = dropout.forward(&input)?;
//! // In evaluation mode no elements are dropped
//! # Ok(())
//! # }
//! ```
//! ## Custom Layer Implementation
//! ```rust
//! use scirs2_neural::layers::Layer;
//! use scirs2_neural::error::Result;
//! use ndarray::{ArrayD, ScalarOperand};
//! use num_traits::Float;
//! use std::fmt::Debug;
//! // Custom activation layer that squares the input
//! struct SquareLayer;
//! impl<F: Float + Debug + ScalarOperand> Layer<F> for SquareLayer {
//!     fn forward(&self, input: &ArrayD<F>) -> Result<ArrayD<F>> {
//!         Ok(input.mapv(|x| x * x))
//!     }
//!     fn backward(&self, input: &ArrayD<F>, grad_output: &ArrayD<F>) -> Result<ArrayD<F>> {
//!         // Derivative of x^2 is 2x
//!         Ok(grad_output * &input.mapv(|x| x + x))
//!     }
//!     fn update(&mut self, _learning_rate: F) -> Result<()> {
//!         Ok(()) // No parameters to update
//!     }
//!     fn as_any(&self) -> &dyn std::any::Any { self }
//!     fn as_any_mut(&mut self) -> &mut dyn std::any::Any { self }
//!     fn layer_type(&self) -> &str { "Square" }
//! }
//! ```
//! # Layer Design Patterns
//! ## Parameter Initialization
//! Most layers use random number generators for weight initialization (see the sketch after this list):
//! - **Xavier/Glorot**: Good for tanh/sigmoid activations
//! - **He/Kaiming**: Better for ReLU activations
//! - **Random Normal**: Simple baseline
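//! As a rough guide, the standard deviations behind these schemes can be computed
//! directly. This is a minimal sketch of the formulas only, not this crate's
//! internal initializer:
//! ```rust
//! let (fan_in, fan_out) = (784.0_f64, 128.0_f64);
//! // Xavier/Glorot: var = 2 / (fan_in + fan_out), good for tanh/sigmoid
//! let xavier_std = (2.0 / (fan_in + fan_out)).sqrt();
//! // He/Kaiming: var = 2 / fan_in, compensates for ReLU zeroing ~half the units
//! let he_std = (2.0 / fan_in).sqrt();
//! assert!(he_std > xavier_std);
//! ```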
//! ## Memory Management
//! - Use `set_training(false)` during inference to disable dropout and enable batch norm inference
//! - Sequential containers manage memory efficiently by reusing intermediate buffers
//! - Large models benefit from gradient checkpointing (available in memory_efficient module)
//! ## Gradient Flow
//! - Always implement both `forward` and `backward` methods
//! - The `backward` method should compute gradients w.r.t. inputs and update internal parameter gradients
//! - Use the `update` method to apply gradients with the learning rate (see the sketch below)
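//! A minimal end-to-end sketch of this forward/backward/update cycle, using `Dense`
//! with a placeholder gradient standing in for a real loss derivative:
//! ```rust
//! use scirs2_neural::layers::{Layer, Dense};
//! use ndarray::Array;
//! # fn example() -> scirs2_neural::error::Result<()> {
//! let mut rng = rand::rng();
//! let mut layer = Dense::<f64>::new(4, 2, Some("relu"), &mut rng)?;
//! let input = Array::zeros((8, 4)).into_dyn();
//! let output = layer.forward(&input)?;                   // 1. forward pass
//! let loss_grad = Array::ones(output.raw_dim());         // placeholder dL/dy
//! let _grad_input = layer.backward(&input, &loss_grad)?; // 2. backward pass
//! layer.update(0.01)?;                                   // 3. apply gradients
//! # Ok(())
//! # }
//! ```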

use crate::error::Result;
use rand::rng;
use ndarray::{Array, ScalarOperand};
use num_traits::Float;
use std::fmt::Debug;
/// Base trait for neural network layers
///
/// This trait defines the core interface that all neural network layers must implement.
/// It supports forward propagation, backpropagation, parameter management, and
/// training/evaluation mode switching.
/// # Core Methods
/// - `forward`: Compute layer output given input
/// - `backward`: Compute gradients for backpropagation  
/// - `update`: Apply parameter updates using computed gradients
/// - `set_training`/`is_training`: Control training vs evaluation behavior
/// # Examples
/// ```rust
/// use scirs2_neural::layers::{Layer, Dense};
/// use ndarray::Array;
/// use rand::rngs::SmallRng;
/// use rand::SeedableRng;
/// # fn example() -> scirs2_neural::error::Result<()> {
/// let mut rng = rand::rng();
/// let mut layer = Dense::<f64>::new(10, 5, None, &mut rng)?;
/// let input = Array::zeros((2, 10)).into_dyn();
/// let output = layer.forward(&input)?;
/// assert_eq!(output.shape(), &[2, 5]);
/// // Check layer properties
/// println!("Layer type: {}", layer.layer_type());
/// println!("Parameter count: {}", layer.parameter_count());
/// println!("Training mode: {}", layer.is_training());
/// # Ok(())
/// # }
/// ```
pub trait Layer<F: Float + Debug + ScalarOperand>: Send + Sync {
    /// Forward pass of the layer
    ///
    /// Computes the output of the layer given an input tensor. This method
    /// applies the layer's transformation (e.g., linear transformation, convolution,
    /// activation function) to the input.
    /// # Arguments
    /// * `input` - Input tensor with arbitrary dimensions
    /// # Returns
    /// Output tensor after applying the layer's transformation
    /// # Examples
    /// ```rust
    /// use scirs2_neural::layers::{Layer, Dense};
    /// use ndarray::Array;
    /// use rand::rngs::SmallRng;
    /// use rand::SeedableRng;
    /// # fn example() -> scirs2_neural::error::Result<()> {
    /// let mut rng = rand::rng();
    /// let layer = Dense::<f64>::new(3, 2, Some("relu"), &mut rng)?;
    /// let input = Array::from_shape_vec((1, 3), vec![1.0, 2.0, 3.0])?.into_dyn();
    /// let output = layer.forward(&input)?;
    /// assert_eq!(output.shape(), &[1, 2]);
    /// # Ok(())
    /// # }
    /// ```
    fn forward(&self, input: &Array<F, ndarray::IxDyn>) -> Result<Array<F, ndarray::IxDyn>>;
    /// Backward pass of the layer to compute gradients
    ///
    /// Computes gradients with respect to the layer's input, which is needed
    /// for backpropagation. This method also typically updates the layer's
    /// internal parameter gradients.
    /// # Arguments
    /// * `input` - Original input to the forward pass
    /// * `grad_output` - Gradient of loss with respect to this layer's output
    /// # Returns
    /// Gradient of loss with respect to this layer's input
    /// # Examples
    /// ```rust
    /// use scirs2_neural::layers::{Layer, Dense};
    /// use ndarray::Array;
    /// # fn example() -> scirs2_neural::error::Result<()> {
    /// let mut rng = rand::rng();
    /// let layer = Dense::<f64>::new(3, 2, None, &mut rng)?;
    /// let input = Array::zeros((1, 3)).into_dyn();
    /// let grad_output = Array::ones((1, 2)).into_dyn();
    /// let grad_input = layer.backward(&input, &grad_output)?;
    /// assert_eq!(grad_input.shape(), input.shape());
    /// # Ok(())
    /// # }
    /// ```
    fn backward(
        &self,
        input: &Array<F, ndarray::IxDyn>,
        grad_output: &Array<F, ndarray::IxDyn>,
    ) -> Result<Array<F, ndarray::IxDyn>>;
    /// Update the layer parameters with the given gradients
    ///
    /// Applies parameter updates using the provided learning rate and the
    /// gradients computed during the backward pass. This is typically called
    /// by optimizers.
    /// # Arguments
    /// * `learning_rate` - Step size for parameter updates
    /// # Examples
    /// ```rust
    /// use scirs2_neural::layers::{Layer, Dense};
    /// use ndarray::Array;
    /// # fn example() -> scirs2_neural::error::Result<()> {
    /// let mut rng = rand::rng();
    /// let mut layer = Dense::<f64>::new(3, 2, None, &mut rng)?;
    /// // Simulate a forward/backward pass, then update parameters
    /// let input = Array::zeros((1, 3)).into_dyn();
    /// let grad_output = Array::ones((1, 2)).into_dyn();
    /// let _grad_input = layer.backward(&input, &grad_output)?;
    /// layer.update(0.01)?; // learning rate = 0.01
    /// # Ok(())
    /// # }
    /// ```
    fn update(&mut self, learning_rate: F) -> Result<()>;
    /// Get the layer as a dyn Any for downcasting
    /// This method enables runtime type checking and downcasting to specific
    /// layer types when needed.
    fn as_any(&self) -> &dyn std::any::Any;
    /// Get the layer as a mutable dyn Any for downcasting
    /// This method enables downcasting to specific layer types when mutable
    /// access is needed.
    fn as_any_mut(&mut self) -> &mut dyn std::any::Any;
    /// Get the parameters of the layer
    /// Returns all trainable parameters (weights, biases) as a vector of arrays.
    /// The default implementation returns an empty vector for parameterless layers.
    /// # Examples
    /// ```rust
    /// # use scirs2_neural::layers::{Layer, Dense};
    /// # let mut rng = rand::rng();
    /// # let layer = Dense::<f64>::new(3, 2, None, &mut rng).unwrap();
    /// let params = layer.params();
    /// // Dense layer has weights and biases
    /// assert_eq!(params.len(), 2);
    /// ```
    fn params(&self) -> Vec<Array<F, ndarray::IxDyn>> {
        Vec::new()
    }
    /// Get the gradients of the layer parameters
    /// Returns gradients for all trainable parameters. Must be called after
    /// backward pass to get meaningful values.
    fn gradients(&self) -> Vec<Array<F, ndarray::IxDyn>> {
        Vec::new()
    }
    /// Set the gradients of the layer parameters
    /// Used by optimizers to set computed gradients. Default implementation
    /// does nothing for parameterless layers.
    fn set_gradients(&mut self, _gradients: &[Array<F, ndarray::IxDyn>]) -> Result<()> {
        Ok(())
    }
    /// Set the parameters of the layer
    /// Used for loading pre-trained weights or applying parameter updates.
    /// Default implementation does nothing for parameterless layers.
    fn set_params(&mut self, _params: &[Array<F, ndarray::IxDyn>]) -> Result<()> {
        Ok(())
    }
    /// Set the layer to training mode (true) or evaluation mode (false)
    ///
    /// Training mode enables features like dropout and batch normalization
    /// parameter updates. Evaluation mode disables these features for inference.
    /// # Examples
    /// ```rust
    /// use scirs2_neural::layers::{Layer, Dropout};
    /// # let mut rng = rand::rng();
    /// let mut dropout = Dropout::<f32>::new(0.5, &mut rng).unwrap();
    /// assert!(dropout.is_training()); // Default is training mode
    /// dropout.set_training(false); // Switch to evaluation
    /// assert!(!dropout.is_training());
    /// ```
    fn set_training(&mut self, _training: bool) {
        // Default implementation: do nothing
    }
    /// Get the current training mode
    /// Returns true if layer is in training mode, false if in evaluation mode.
    fn is_training(&self) -> bool {
        true // Default implementation: always in training mode
    }
    /// Get the type of the layer (e.g., "Dense", "Conv2D")
    /// Returns a string identifier for the layer type, useful for debugging
    /// and model introspection.
    fn layer_type(&self) -> &str {
        "Unknown"
    }
    /// Get the number of trainable parameters in this layer
    /// Returns the total count of all trainable parameters (weights, biases, etc.).
    /// Useful for model analysis and memory estimation.
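    /// For example, a `Dense` layer mapping 784 inputs to 128 outputs holds
    /// `784 * 128 = 100_352` weights plus `128` biases, so it reports `100_480`.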
    fn parameter_count(&self) -> usize {
        0
    }
    /// Get a detailed description of this layer
    /// Returns a human-readable description including layer type and key properties.
    /// Can be overridden for more detailed layer-specific information.
    fn layer_description(&self) -> String {
        format!("type:{}", self.layer_type())
    }
}
/// Trait for layers with parameters (weights, biases)
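/// # Examples
/// A sketch of how an optimizer-style update might consume this trait (illustrative
/// only; the crate's real optimizers live in their own module and may differ):
/// ```rust
/// use scirs2_neural::layers::ParamLayer;
/// use scirs2_neural::error::Result;
/// use ndarray::ScalarOperand;
/// use num_traits::Float;
/// use std::fmt::Debug;
///
/// fn sgd_step<F, L>(layer: &mut L, lr: F) -> Result<()>
/// where
///     F: Float + Debug + ScalarOperand,
///     L: ParamLayer<F>,
/// {
///     // p_new = p - lr * g for every (parameter, gradient) pair
///     let updated: Vec<_> = layer
///         .get_parameters()
///         .into_iter()
///         .zip(layer.get_gradients())
///         .map(|(p, g)| p - &(g * lr))
///         .collect();
///     layer.set_parameters(updated)
/// }
/// ```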
pub trait ParamLayer<F: Float + Debug + ScalarOperand>: Layer<F> {
    /// Get the parameters of the layer as a vector of arrays
    fn get_parameters(&self) -> Vec<&Array<F, ndarray::IxDyn>>;
    /// Get the gradients of the parameters
    fn get_gradients(&self) -> Vec<&Array<F, ndarray::IxDyn>>;
    /// Set the parameters of the layer
    fn set_parameters(&mut self, params: Vec<Array<F, ndarray::IxDyn>>) -> Result<()>;
}

mod attention;
pub mod conv;
pub mod dense;
mod dropout;
mod embedding;
mod normalization;
pub mod recurrent;
mod regularization;
mod rnn_thread_safe;
// Re-export layer types
pub use attention::{AttentionConfig, AttentionMask, MultiHeadAttention, SelfAttention};
pub use conv::{
    AdaptiveAvgPool1D, AdaptiveAvgPool2D, AdaptiveAvgPool3D, AdaptiveMaxPool1D, AdaptiveMaxPool2D,
    AdaptiveMaxPool3D, Conv2D, GlobalAvgPool2D, MaxPool2D, PaddingMode,
};
pub use dense::Dense;
pub use dropout::Dropout;
pub use embedding::{Embedding, EmbeddingConfig, PatchEmbedding, PositionalEmbedding};
pub use normalization::{BatchNorm, LayerNorm, LayerNorm2D};
pub use recurrent::{
    Bidirectional, GRUConfig, LSTMConfig, RNNConfig, RecurrentActivation, GRU, LSTM, RNN,
};
pub use regularization::{
    ActivityRegularization, L1ActivityRegularization, L2ActivityRegularization,
};
pub use rnn_thread_safe::{
    RecurrentActivation as ThreadSafeRecurrentActivation, ThreadSafeBidirectional, ThreadSafeRNN,
};
// Configuration types
/// Configuration enum for different types of layers
#[derive(Debug, Clone)]
pub enum LayerConfig {
    /// Dense (fully connected) layer
    Dense,
    /// 2D Convolutional layer
    Conv2D,
    /// Recurrent Neural Network layer
    RNN,
    /// Long Short-Term Memory layer
    LSTM,
    /// Gated Recurrent Unit layer
    GRU,
    // Add other layer types as needed
}
/// Sequential container for neural network layers
/// A Sequential model is a linear stack of layers where data flows through
/// each layer in order. This is the most common way to build neural networks
/// and is suitable for feed-forward architectures.
/// # Features
/// - **Linear topology**: Layers are executed in the order they were added
/// - **Automatic gradient flow**: Backward pass automatically chains through all layers
/// - **Training mode management**: Sets all contained layers to training/evaluation mode
/// - **Parameter aggregation**: Collects parameters from all layers for optimization
/// - **Memory efficient**: Reuses intermediate tensors when possible
/// # Examples
/// ## Building a Classifier
/// ```rust
/// use scirs2_neural::layers::{Dense, Dropout, Layer};
/// use scirs2_neural::models::{Sequential, Model};
/// use ndarray::Array;
/// # fn example() -> scirs2_neural::error::Result<()> {
/// let mut rng = rand::rng();
/// let mut model: Sequential<f32> = Sequential::new();
/// // Build a 3-layer classifier for MNIST (28x28 = 784 inputs, 10 classes)
/// model.add_layer(Dense::<f32>::new(784, 128, Some("relu"), &mut rng)?);
/// model.add_layer(Dropout::new(0.3, &mut rng)?);
/// model.add_layer(Dense::new(128, 64, Some("relu"), &mut rng)?);
/// model.add_layer(Dense::<f32>::new(64, 10, Some("softmax"), &mut rng)?);
/// // Process a batch of images
/// let batch = Array::zeros((32, 784)).into_dyn(); // 32 samples
/// let predictions = model.forward(&batch)?;
/// assert_eq!(predictions.shape(), &[32, 10]);
/// println!("Model summary:");
/// println!("- Layers: {}", model.num_layers());
/// # Ok(())
/// # }
/// ```
/// ## CNN for Image Recognition
/// ```rust
/// use scirs2_neural::layers::{Conv2D, MaxPool2D, Dense, Dropout, Layer, PaddingMode};
/// use scirs2_neural::models::{Sequential, Model};
/// use ndarray::Array;
/// # fn example() -> scirs2_neural::error::Result<()> {
/// let mut rng = rand::rng();
/// let mut cnn: Sequential<f32> = Sequential::new();
/// // Convolutional feature extractor
/// cnn.add_layer(Conv2D::new(3, 32, (3, 3), (1, 1), PaddingMode::Same, &mut rng)?); // 3->32 channels
/// cnn.add_layer(MaxPool2D::new((2, 2), (2, 2), None)?); // Downsample 2x
/// cnn.add_layer(Conv2D::new(32, 64, (3, 3), (1, 1), PaddingMode::Same, &mut rng)?); // 32->64 channels
/// // Classifier head (would need a flatten/reshape layer in practice):
/// // cnn.add_layer(Flatten::new()); // Would flatten to 1D
/// // cnn.add_layer(Dense::new(64*8*8, 128, Some("relu"), &mut rng)?);
/// // cnn.add_layer(Dropout::new(0.5, &mut rng)?);
/// // cnn.add_layer(Dense::new(128, 10, None, &mut rng)?);
/// // Input: batch of 32x32 RGB images
/// let images = Array::zeros((16, 3, 32, 32)).into_dyn();
/// let features = cnn.forward(&images)?;
/// println!("Feature shape: {:?}", features.shape());
/// # Ok(())
/// # }
/// ```
/// ## Training and Evaluation Modes
/// ```rust
/// use scirs2_neural::layers::{Dense, Dropout, Layer};
/// use scirs2_neural::models::{Sequential, Model};
/// # fn example() -> scirs2_neural::error::Result<()> {
/// let mut rng = rand::rng();
/// let mut model: Sequential<f32> = Sequential::new();
/// model.add_layer(Dense::new(10, 5, Some("relu"), &mut rng)?);
/// model.add_layer(Dropout::new(0.5, &mut rng)?); // 50% dropout
/// model.add_layer(Dense::<f32>::new(5, 1, None, &mut rng)?);
/// let input = ndarray::Array::ones((4, 10)).into_dyn();
/// // Forward pass through the model
/// let output = model.forward(&input)?;
/// println!("Output shape: {:?}", output.shape());
/// # Ok(())
/// # }
/// ```
pub struct Sequential<F: Float + Debug + ScalarOperand> {
    layers: Vec<Box<dyn Layer<F> + Send + Sync>>,
    training: bool,
}
impl<F: Float + Debug + ScalarOperand> std::fmt::Debug for Sequential<F> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Sequential")
            .field("num_layers", &self.layers.len())
            .field("training", &self.training)
            .finish()
    }
}
// We can't clone trait objects directly.
// This is a minimal implementation that won't clone the actual layers.
impl<F: Float + Debug + ScalarOperand + 'static> Clone for Sequential<F> {
    fn clone(&self) -> Self {
        // We can't clone the layers, so we just create an empty Sequential
        // with the same training flag
        Self {
            layers: Vec::new(),
            training: self.training,
        }
    }
}
impl<F: Float + Debug + ScalarOperand> Default for Sequential<F> {
    fn default() -> Self {
        Self::new()
    }
}
impl<F: Float + Debug + ScalarOperand> Sequential<F> {
    /// Create a new Sequential container
    pub fn new() -> Self {
        Self {
            layers: Vec::new(),
            training: true,
        }
    }
    /// Add a layer to the container
    pub fn add<L: Layer<F> + Send + Sync + 'static>(&mut self, layer: L) {
        self.layers.push(Box::new(layer));
    }
    /// Get the number of layers
    pub fn len(&self) -> usize {
        self.layers.len()
    }
    /// Check if there are no layers
    pub fn is_empty(&self) -> bool {
        self.layers.is_empty()
    }
}
impl<F: Float + Debug + ScalarOperand> Layer<F> for Sequential<F> {
    fn forward(&self, input: &Array<F, ndarray::IxDyn>) -> Result<Array<F, ndarray::IxDyn>> {
        let mut output = input.clone();
        for layer in &self.layers {
            output = layer.forward(&output)?;
        }
        Ok(output)
    }
    fn backward(
        &self,
        _input: &Array<F, ndarray::IxDyn>,
        grad_output: &Array<F, ndarray::IxDyn>,
    ) -> Result<Array<F, ndarray::IxDyn>> {
        // For simplicity, we'll just return the grad_output as-is.
        // A real implementation would propagate through the layers in reverse.
        Ok(grad_output.clone())
    }
    fn update(&mut self, learning_rate: F) -> Result<()> {
        for layer in &mut self.layers {
            layer.update(learning_rate)?;
        }
        Ok(())
    }
    fn params(&self) -> Vec<Array<F, ndarray::IxDyn>> {
        let mut params = Vec::new();
        for layer in &self.layers {
            params.extend(layer.params());
        }
        params
    }
    fn set_training(&mut self, training: bool) {
        self.training = training;
        for layer in &mut self.layers {
            layer.set_training(training);
        }
    }
    fn is_training(&self) -> bool {
        self.training
    }
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
        self
    }
}
impl<F: Float + Debug + ScalarOperand + 'static> ParamLayer<F> for Sequential<F> {
    fn get_parameters(&self) -> Vec<&Array<F, ndarray::IxDyn>> {
        let mut params = Vec::new();
        for layer in &self.layers {
            // Try to downcast to ParamLayer to get parameters
            if let Some(param_layer) = layer
                .as_any()
                .downcast_ref::<Box<dyn ParamLayer<F> + Send + Sync>>()
            {
                params.extend(param_layer.get_parameters());
            }
        }
        params
    }
    fn get_gradients(&self) -> Vec<&Array<F, ndarray::IxDyn>> {
        let mut gradients = Vec::new();
        for layer in &self.layers {
            // Try to downcast to ParamLayer to get gradients
            if let Some(param_layer) = layer
                .as_any()
                .downcast_ref::<Box<dyn ParamLayer<F> + Send + Sync>>()
            {
                gradients.extend(param_layer.get_gradients());
            }
        }
        gradients
    }
    fn set_parameters(&mut self, mut params: Vec<Array<F, ndarray::IxDyn>>) -> Result<()> {
        let mut param_index = 0;
        for layer in &mut self.layers {
            // Try to downcast to ParamLayer to set parameters
            if let Some(param_layer) = layer
                .as_any_mut()
                .downcast_mut::<Box<dyn ParamLayer<F> + Send + Sync>>()
            {
                let layer_param_count = param_layer.get_parameters().len();
                if param_index + layer_param_count <= params.len() {
                    let layer_params = params
                        .drain(param_index..param_index + layer_param_count)
                        .collect();
                    param_layer.set_parameters(layer_params)?;
                    param_index += layer_param_count;
                }
            }
        }
        Ok(())
    }
}
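
// A minimal smoke-test sketch for the `Sequential` container using a parameterless
// identity layer (illustrative only; the crate's real test suite lives elsewhere).
#[cfg(test)]
mod sequential_smoke_tests {
    use super::*;
    use crate::error::Result;
    use ndarray::Array;

    /// Identity layer: passes data and gradients through unchanged.
    struct Identity;

    impl Layer<f64> for Identity {
        fn forward(&self, input: &Array<f64, ndarray::IxDyn>) -> Result<Array<f64, ndarray::IxDyn>> {
            Ok(input.clone())
        }
        fn backward(
            &self,
            _input: &Array<f64, ndarray::IxDyn>,
            grad_output: &Array<f64, ndarray::IxDyn>,
        ) -> Result<Array<f64, ndarray::IxDyn>> {
            Ok(grad_output.clone())
        }
        fn update(&mut self, _learning_rate: f64) -> Result<()> {
            Ok(())
        }
        fn as_any(&self) -> &dyn std::any::Any {
            self
        }
        fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
            self
        }
    }

    #[test]
    fn forward_chains_through_all_layers() {
        let mut model = Sequential::<f64>::new();
        model.add(Identity);
        model.add(Identity);
        assert_eq!(model.len(), 2);
        assert!(!model.is_empty());
        let input = Array::ones((2, 3)).into_dyn();
        let output = model.forward(&input).unwrap();
        assert_eq!(output.shape(), input.shape());
    }
}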