scirs2-neural 0.2.0

Neural network building blocks module for SciRS2 (scirs2-neural) - Minimal Version
//! Neural network layers implementation
//!
//! This module provides implementations of various neural network layers
//! such as dense (fully connected), attention, convolution, pooling, etc.
//! Layers are the fundamental building blocks of neural networks.
//! # Overview
//! Neural network layers transform input data through learned parameters (weights and biases).
//! Each layer implements the `Layer` trait, which defines the interface for forward and
//! backward propagation, parameter management, and training/evaluation modes.
//! # Available Layer Types
//! ## Core Layers
//! - **Dense**: Fully connected linear transformation
//! - **Conv2D**: 2D convolutional layers for image processing
//! - **Embedding**: Lookup tables for discrete inputs (words, tokens)
//! ## Activation & Regularization
//! - **Dropout**: Randomly sets inputs to zero during training
//! - **BatchNorm/LayerNorm**: Normalization for stable training
//! - **ActivityRegularization**: L1/L2 penalties on activations
//! ## Pooling & Reshaping
//! - **MaxPool2D/AdaptiveMaxPool2D**: Spatial downsampling
//! - **GlobalAvgPool2D**: Global spatial average pooling
//! ## Attention & Sequence
//! - **MultiHeadAttention**: Transformer-style attention mechanism
//! - **LSTM/GRU**: Recurrent layers for sequences
//! - **Bidirectional**: Wrapper for bidirectional RNNs
//! ## Embedding & Positional
//! - **PositionalEmbedding**: Learned positional encodings
//! - **PatchEmbedding**: Convert image patches to embeddings
//! # Examples
//! ## Creating a Simple Dense Layer
//! ```rust
//! use scirs2_neural::layers::{Layer, Dense};
//! use ndarray::Array;
//! use rand::rngs::SmallRng;
//! use rand::SeedableRng;
//! # fn example() -> scirs2_neural::error::Result<()> {
//! let mut rng = rand::rng();
//! // Create a dense layer: 784 inputs -> 128 outputs with ReLU activation
//! let dense = Dense::<f64>::new(784, 128, Some("relu"), &mut rng)?;
//! // Create input batch (batch_size=2, features=784)
//! let input = Array::zeros((2, 784)).into_dyn();
//! // Forward pass
//! let output = dense.forward(&input)?;
//! assert_eq!(output.shape(), &[2, 128]);
//! println!("Layer type: {}", dense.layer_type());
//! println!("Parameters: {}", dense.parameter_count());
//! # Ok(())
//! # }
//! ```
//! ## Building a Sequential Model
//! ```rust
//! use scirs2_neural::layers::{Layer, Dense, Dropout};
//! use scirs2_neural::models::{Sequential, Model};
//! use ndarray::Array;
//! # fn example() -> scirs2_neural::error::Result<()> {
//! let mut rng = rand::rng();
//! let mut model: Sequential<f32> = Sequential::new();
//! // Build a multi-layer network
//! model.add_layer(Dense::<f32>::new(784, 512, Some("relu"), &mut rng)?);
//! model.add_layer(Dropout::<f32>::new(0.2, &mut rng)?);
//! model.add_layer(Dense::<f32>::new(512, 256, Some("relu"), &mut rng)?);
//! model.add_layer(Dense::<f32>::new(256, 10, Some("softmax"), &mut rng)?);
//! // Input: batch of MNIST-like images (batch_size=32, flattened=784)
//! let input = Array::zeros((32, 784)).into_dyn();
//! // Forward pass through entire model
//! let output = model.forward(&input)?;
//! assert_eq!(output.shape(), &[32, 10]); // 10-class predictions
//! println!("Model has {} layers", model.num_layers());
//! let total_params: usize = model.layers().iter().map(|l| l.parameter_count()).sum();
//! println!("Total parameters: {}", total_params);
//! # Ok(())
//! # }
//! ```
//! ## Using Convolutional Layers
//! ```rust
//! use scirs2_neural::layers::{Layer, Conv2D, MaxPool2D, PaddingMode};
//! use ndarray::Array;
//! # fn example() -> scirs2_neural::error::Result<()> {
//! let mut rng = rand::rng();
//! // Create conv layer: 3 input channels -> 32 output channels, 3x3 kernel
//! let conv = Conv2D::<f64>::new(3, 32, (3, 3), (1, 1), PaddingMode::Same, &mut rng)?;
//! let pool = MaxPool2D::<f64>::new((2, 2), (2, 2), None)?; // 2x2 max pooling
//! // Input: batch of RGB images (batch=4, channels=3, height=32, width=32)
//! let input = Array::zeros((4, 3, 32, 32)).into_dyn();
//! // Apply convolution then pooling
//! let conv_out = conv.forward(&input)?;
//! assert_eq!(conv_out.shape(), &[4, 32, 32, 32]); // Same padding preserves spatial size
//! let pool_out = pool.forward(&conv_out)?;
//! assert_eq!(pool_out.shape(), &[4, 32, 16, 16]); // Pooling halves spatial dims
//! # Ok(())
//! # }
//! ```
//! ## Training vs Evaluation Mode
//! ```rust
//! use scirs2_neural::layers::{Layer, Dropout, BatchNorm};
//! use ndarray::Array;
//! # fn example() -> scirs2_neural::error::Result<()> {
//! let mut rng = rand::rng();
//! let mut dropout = Dropout::<f64>::new(0.5, &mut rng)?;
//! let mut batchnorm = BatchNorm::<f64>::new(128, 0.9, 1e-5, &mut rng)?;
//! let input = Array::ones((10, 128)).into_dyn();
//! // Training mode (default): some outputs will be zeroed by dropout
//! assert!(dropout.is_training());
//! let train_output = dropout.forward(&input)?;
//! // Switch to evaluation mode for inference
//! dropout.set_training(false);
//! batchnorm.set_training(false);
//! let eval_output = dropout.forward(&input)?;
//! // In evaluation mode no elements are dropped
//! # Ok(())
//! # }
//! ```
//! ## Custom Layer Implementation
//! ```rust
//! use scirs2_neural::layers::Layer;
//! use scirs2_neural::error::Result;
//! use ndarray::{ArrayD, ScalarOperand};
//! use num_traits::Float;
//! use std::fmt::Debug;
//! // Custom activation layer that squares the input
//! struct SquareLayer;
//! impl<F: Float + Debug + ScalarOperand> Layer<F> for SquareLayer {
//!     fn forward(&self, input: &ArrayD<F>) -> Result<ArrayD<F>> {
//!         Ok(input.mapv(|x| x * x))
//!     }
//!     fn backward(&self, input: &ArrayD<F>, grad_output: &ArrayD<F>) -> Result<ArrayD<F>> {
//!         // Derivative of x^2 is 2x
//!         Ok(grad_output * &input.mapv(|x| x + x))
//!     }
//!     fn update(&mut self, _learning_rate: F) -> Result<()> {
//!         Ok(()) // No parameters to update
//!     }
//!     fn as_any(&self) -> &dyn std::any::Any { self }
//!     fn as_any_mut(&mut self) -> &mut dyn std::any::Any { self }
//!     fn layer_type(&self) -> &str { "Square" }
//! }
//! ```
//! # Layer Design Patterns
//! ## Parameter Initialization
//! Most layers use random number generators for weight initialization (see the sketch after this list):
//! - **Xavier/Glorot**: Good for tanh/sigmoid activations
//! - **He/Kaiming**: Better for ReLU activations
//! - **Random Normal**: Simple baseline
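//! As a rough guide, the standard deviations behind these schemes can be computed
//! directly. This is a minimal sketch of the formulas only, not this crate's
//! internal initializer:
//! ```rust
//! let (fan_in, fan_out) = (784.0_f64, 128.0_f64);
//! // Xavier/Glorot: var = 2 / (fan_in + fan_out), good for tanh/sigmoid
//! let xavier_std = (2.0 / (fan_in + fan_out)).sqrt();
//! // He/Kaiming: var = 2 / fan_in, compensates for ReLU zeroing ~half the units
//! let he_std = (2.0 / fan_in).sqrt();
//! assert!(he_std > xavier_std);
//! ```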
//! ## Memory Management
//! - Use `set_training(false)` during inference to disable dropout and enable batch norm inference
//! - Sequential containers manage memory efficiently by reusing intermediate buffers
//! - Large models benefit from gradient checkpointing (available in memory_efficient module)
//! ## Gradient Flow
//! - Always implement both `forward` and `backward` methods
//! - The `backward` method should compute gradients w.r.t. inputs and update internal parameter gradients
//! - Use the `update` method to apply gradients with the learning rate (see the sketch below)
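//! A minimal end-to-end sketch of this forward/backward/update cycle, using `Dense`
//! with a placeholder gradient standing in for a real loss derivative:
//! ```rust
//! use scirs2_neural::layers::{Layer, Dense};
//! use ndarray::Array;
//! # fn example() -> scirs2_neural::error::Result<()> {
//! let mut rng = rand::rng();
//! let mut layer = Dense::<f64>::new(4, 2, Some("relu"), &mut rng)?;
//! let input = Array::zeros((8, 4)).into_dyn();
//! let output = layer.forward(&input)?;                   // 1. forward pass
//! let loss_grad = Array::ones(output.raw_dim());         // placeholder dL/dy
//! let _grad_input = layer.backward(&input, &loss_grad)?; // 2. backward pass
//! layer.update(0.01)?;                                   // 3. apply gradients
//! # Ok(())
//! # }
//! ```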

use crate::error::Result;
use rand::rng;
use ndarray::{Array, ScalarOperand};
use num_traits::Float;
use std::fmt::Debug;
/// Base trait for neural network layers
///
/// This trait defines the core interface that all neural network layers must implement.
/// It supports forward propagation, backpropagation, parameter management, and
/// training/evaluation mode switching.
/// # Core Methods
/// - `forward`: Compute layer output given input
/// - `backward`: Compute gradients for backpropagation  
/// - `update`: Apply parameter updates using computed gradients
/// - `set_training`/`is_training`: Control training vs evaluation behavior
/// # Examples
/// ```rust
/// use scirs2_neural::layers::{Layer, Dense};
/// use ndarray::Array;
/// use rand::rngs::SmallRng;
/// use rand::SeedableRng;
/// # fn example() -> scirs2_neural::error::Result<()> {
/// let mut rng = rand::rng();
/// let mut layer = Dense::<f64>::new(10, 5, None, &mut rng)?;
/// let input = Array::zeros((2, 10)).into_dyn();
/// let output = layer.forward(&input)?;
/// assert_eq!(output.shape(), &[2, 5]);
/// // Check layer properties
/// println!("Layer type: {}", layer.layer_type());
/// println!("Parameter count: {}", layer.parameter_count());
/// println!("Training mode: {}", layer.is_training());
/// # Ok(())
/// # }
/// ```
pub trait Layer<F: Float + Debug + ScalarOperand>: Send + Sync {
    /// Forward pass of the layer
    ///
    /// Computes the output of the layer given an input tensor. This method
    /// applies the layer's transformation (e.g., linear transformation, convolution,
    /// activation function) to the input.
    /// # Arguments
    /// * `input` - Input tensor with arbitrary dimensions
    /// # Returns
    /// Output tensor after applying the layer's transformation
    /// # Examples
    /// ```rust
    /// use scirs2_neural::layers::{Layer, Dense};
    /// use ndarray::Array;
    /// use rand::rngs::SmallRng;
    /// use rand::SeedableRng;
    /// # fn example() -> scirs2_neural::error::Result<()> {
    /// let mut rng = rand::rng();
    /// let layer = Dense::<f64>::new(3, 2, Some("relu"), &mut rng)?;
    /// let input = Array::from_shape_vec((1, 3), vec![1.0, 2.0, 3.0])?.into_dyn();
    /// let output = layer.forward(&input)?;
    /// assert_eq!(output.shape(), &[1, 2]);
    /// # Ok(())
    /// # }
    /// ```
    fn forward(&self, input: &Array<F, ndarray::IxDyn>) -> Result<Array<F, ndarray::IxDyn>>;
    /// Backward pass of the layer to compute gradients
    ///
    /// Computes gradients with respect to the layer's input, which is needed
    /// for backpropagation. This method also typically updates the layer's
    /// internal parameter gradients.
    /// # Arguments
    /// * `input` - Original input to the forward pass
    /// * `grad_output` - Gradient of loss with respect to this layer's output
    /// # Returns
    /// Gradient of loss with respect to this layer's input
    /// # Examples
    /// ```rust
    /// use scirs2_neural::layers::{Layer, Dense};
    /// use ndarray::Array;
    /// # fn example() -> scirs2_neural::error::Result<()> {
    /// let mut rng = rand::rng();
    /// let layer = Dense::<f64>::new(3, 2, None, &mut rng)?;
    /// let input = Array::zeros((1, 3)).into_dyn();
    /// let grad_output = Array::ones((1, 2)).into_dyn();
    /// let grad_input = layer.backward(&input, &grad_output)?;
    /// assert_eq!(grad_input.shape(), input.shape());
    /// # Ok(())
    /// # }
    /// ```
    fn backward(
        &self,
        input: &Array<F, ndarray::IxDyn>,
        grad_output: &Array<F, ndarray::IxDyn>,
    ) -> Result<Array<F, ndarray::IxDyn>>;
    /// Update the layer parameters with the given gradients
    ///
    /// Applies parameter updates using the provided learning rate and the
    /// gradients computed during the backward pass. This is typically called
    /// by optimizers.
    /// # Arguments
    /// * `learning_rate` - Step size for parameter updates
    /// # Examples
    /// ```rust
    /// use scirs2_neural::layers::{Layer, Dense};
    /// use ndarray::Array;
    /// # fn example() -> scirs2_neural::error::Result<()> {
    /// let mut rng = rand::rng();
    /// let mut layer = Dense::<f64>::new(3, 2, None, &mut rng)?;
    /// // Simulate a forward/backward pass, then update parameters
    /// let input = Array::zeros((1, 3)).into_dyn();
    /// let grad_output = Array::ones((1, 2)).into_dyn();
    /// let _grad_input = layer.backward(&input, &grad_output)?;
    /// layer.update(0.01)?; // learning rate = 0.01
    /// # Ok(())
    /// # }
    /// ```
    fn update(&mut self, learning_rate: F) -> Result<()>;
    /// Get the layer as a dyn Any for downcasting
    /// This method enables runtime type checking and downcasting to specific
    /// layer types when needed.
    fn as_any(&self) -> &dyn std::any::Any;
    /// Get the layer as a mutable dyn Any for downcasting
    /// This method enables downcasting to specific layer types when mutable
    /// access is needed.
    fn as_any_mut(&mut self) -> &mut dyn std::any::Any;
    /// Get the parameters of the layer
    /// Returns all trainable parameters (weights, biases) as a vector of arrays.
    /// The default implementation returns an empty vector for parameterless layers.
    /// # Examples
    /// ```rust
    /// # use scirs2_neural::layers::{Layer, Dense};
    /// # let mut rng = rand::rng();
    /// # let layer = Dense::<f64>::new(3, 2, None, &mut rng).unwrap();
    /// let params = layer.params();
    /// // Dense layer has weights and biases
    /// assert_eq!(params.len(), 2);
    /// ```
    fn params(&self) -> Vec<Array<F, ndarray::IxDyn>> {
        Vec::new()
    }
    /// Get the gradients of the layer parameters
    /// Returns gradients for all trainable parameters. Must be called after
    /// backward pass to get meaningful values.
    fn gradients(&self) -> Vec<Array<F, ndarray::IxDyn>> {
        Vec::new()
    }
    /// Set the gradients of the layer parameters
    /// Used by optimizers to set computed gradients. Default implementation
    /// does nothing for parameterless layers.
    fn set_gradients(&mut self, _gradients: &[Array<F, ndarray::IxDyn>]) -> Result<()> {
        Ok(())
    }
    /// Set the parameters of the layer
    /// Used for loading pre-trained weights or applying parameter updates.
    /// Default implementation does nothing for parameterless layers.
    fn set_params(&mut self, _params: &[Array<F, ndarray::IxDyn>]) -> Result<()> {
        Ok(())
    }
    /// Set the layer to training mode (true) or evaluation mode (false)
    ///
    /// Training mode enables features like dropout and batch normalization
    /// parameter updates. Evaluation mode disables these features for inference.
    /// # Examples
    /// ```rust
    /// use scirs2_neural::layers::{Layer, Dropout};
    /// # let mut rng = rand::rng();
    /// let mut dropout = Dropout::<f32>::new(0.5, &mut rng).unwrap();
    /// assert!(dropout.is_training()); // Default is training mode
    /// dropout.set_training(false); // Switch to evaluation
    /// assert!(!dropout.is_training());
    /// ```
    fn set_training(&mut self, _training: bool) {
        // Default implementation: do nothing
    }
    /// Get the current training mode
    /// Returns true if layer is in training mode, false if in evaluation mode.
    fn is_training(&self) -> bool {
        true // Default implementation: always in training mode
    }
    /// Get the type of the layer (e.g., "Dense", "Conv2D")
    /// Returns a string identifier for the layer type, useful for debugging
    /// and model introspection.
    fn layer_type(&self) -> &str {
        "Unknown"
    }
    /// Get the number of trainable parameters in this layer
    /// Returns the total count of all trainable parameters (weights, biases, etc.).
    /// Useful for model analysis and memory estimation.
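    /// For example, a `Dense` layer mapping 784 inputs to 128 outputs holds
    /// `784 * 128 = 100_352` weights plus `128` biases, so it reports `100_480`.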
    fn parameter_count(&self) -> usize {
        0
    }
    /// Get a detailed description of this layer
    /// Returns a human-readable description including layer type and key properties.
    /// Can be overridden for more detailed layer-specific information.
    fn layer_description(&self) -> String {
        format!("type:{}", self.layer_type())
    }
}
/// Trait for layers with parameters (weights, biases)
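/// # Examples
/// A sketch of how an optimizer-style update might consume this trait (illustrative
/// only; the crate's real optimizers live in their own module and may differ):
/// ```rust
/// use scirs2_neural::layers::ParamLayer;
/// use scirs2_neural::error::Result;
/// use ndarray::ScalarOperand;
/// use num_traits::Float;
/// use std::fmt::Debug;
///
/// fn sgd_step<F, L>(layer: &mut L, lr: F) -> Result<()>
/// where
///     F: Float + Debug + ScalarOperand,
///     L: ParamLayer<F>,
/// {
///     // p_new = p - lr * g for every (parameter, gradient) pair
///     let updated: Vec<_> = layer
///         .get_parameters()
///         .into_iter()
///         .zip(layer.get_gradients())
///         .map(|(p, g)| p - &(g * lr))
///         .collect();
///     layer.set_parameters(updated)
/// }
/// ```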
pub trait ParamLayer<F: Float + Debug + ScalarOperand>: Layer<F> {
    /// Get the parameters of the layer as a vector of arrays
    fn get_parameters(&self) -> Vec<&Array<F, ndarray::IxDyn>>;
    /// Get the gradients of the parameters
    fn get_gradients(&self) -> Vec<&Array<F, ndarray::IxDyn>>;
    /// Set the parameters of the layer
    fn set_parameters(&mut self, params: Vec<Array<F, ndarray::IxDyn>>) -> Result<()>;
}

mod attention;
pub mod conv;
pub mod dense;
mod dropout;
mod embedding;
mod normalization;
pub mod recurrent;
mod regularization;
mod rnn_thread_safe;
// Re-export layer types
pub use attention::{AttentionConfig, AttentionMask, MultiHeadAttention, SelfAttention};
pub use conv::{
    AdaptiveAvgPool1D, AdaptiveAvgPool2D, AdaptiveAvgPool3D, AdaptiveMaxPool1D, AdaptiveMaxPool2D,
    AdaptiveMaxPool3D, Conv2D, GlobalAvgPool2D, MaxPool2D, PaddingMode,
};
pub use dense::Dense;
pub use dropout::Dropout;
pub use embedding::{Embedding, EmbeddingConfig, PatchEmbedding, PositionalEmbedding};
pub use normalization::{BatchNorm, LayerNorm, LayerNorm2D};
pub use recurrent::{
    Bidirectional, GRUConfig, LSTMConfig, RNNConfig, RecurrentActivation, GRU, LSTM, RNN,
};
pub use regularization::{
    ActivityRegularization, L1ActivityRegularization, L2ActivityRegularization,
};
pub use rnn_thread_safe::{
    RecurrentActivation as ThreadSafeRecurrentActivation, ThreadSafeBidirectional, ThreadSafeRNN,
};
// Configuration types
/// Configuration enum for different types of layers
#[derive(Debug, Clone)]
pub enum LayerConfig {
    /// Dense (fully connected) layer
    Dense,
    /// 2D Convolutional layer
    Conv2D,
    /// Recurrent Neural Network layer
    RNN,
    /// Long Short-Term Memory layer
    LSTM,
    /// Gated Recurrent Unit layer
    GRU,
    // Add other layer types as needed
}
/// Sequential container for neural network layers
/// A Sequential model is a linear stack of layers where data flows through
/// each layer in order. This is the most common way to build neural networks
/// and is suitable for feed-forward architectures.
/// # Features
/// - **Linear topology**: Layers are executed in the order they were added
/// - **Automatic gradient flow**: Backward pass automatically chains through all layers
/// - **Training mode management**: Sets all contained layers to training/evaluation mode
/// - **Parameter aggregation**: Collects parameters from all layers for optimization
/// - **Memory efficient**: Reuses intermediate tensors when possible
/// # Examples
/// ## Building a Classifier
/// ```rust
/// use scirs2_neural::layers::{Dense, Dropout, Layer};
/// use scirs2_neural::models::{Sequential, Model};
/// use ndarray::Array;
/// # fn example() -> scirs2_neural::error::Result<()> {
/// let mut rng = rand::rng();
/// let mut model: Sequential<f32> = Sequential::new();
/// // Build a 3-layer classifier for MNIST (28x28 = 784 inputs, 10 classes)
/// model.add_layer(Dense::<f32>::new(784, 128, Some("relu"), &mut rng)?);
/// model.add_layer(Dropout::new(0.3, &mut rng)?);
/// model.add_layer(Dense::new(128, 64, Some("relu"), &mut rng)?);
/// model.add_layer(Dense::<f32>::new(64, 10, Some("softmax"), &mut rng)?);
/// // Process a batch of images
/// let batch = Array::zeros((32, 784)).into_dyn(); // 32 samples
/// let predictions = model.forward(&batch)?;
/// assert_eq!(predictions.shape(), &[32, 10]);
/// println!("Model summary:");
/// println!("- Layers: {}", model.num_layers());
/// # Ok(())
/// # }
/// ```
/// ## CNN for Image Recognition
/// ```rust
/// use scirs2_neural::layers::{Conv2D, MaxPool2D, Dense, Dropout, Layer, PaddingMode};
/// use scirs2_neural::models::{Sequential, Model};
/// use ndarray::Array;
/// # fn example() -> scirs2_neural::error::Result<()> {
/// let mut rng = rand::rng();
/// let mut cnn: Sequential<f32> = Sequential::new();
/// // Convolutional feature extractor
/// cnn.add_layer(Conv2D::new(3, 32, (3, 3), (1, 1), PaddingMode::Same, &mut rng)?); // 3->32 channels
/// cnn.add_layer(MaxPool2D::new((2, 2), (2, 2), None)?); // Downsample 2x
/// cnn.add_layer(Conv2D::new(32, 64, (3, 3), (1, 1), PaddingMode::Same, &mut rng)?); // 32->64 channels
/// // Classifier head (would need a flatten/reshape layer in practice):
/// // cnn.add_layer(Flatten::new()); // Would flatten to 1D
/// // cnn.add_layer(Dense::new(64*8*8, 128, Some("relu"), &mut rng)?);
/// // cnn.add_layer(Dropout::new(0.5, &mut rng)?);
/// // cnn.add_layer(Dense::new(128, 10, None, &mut rng)?);
/// // Input: batch of 32x32 RGB images
/// let images = Array::zeros((16, 3, 32, 32)).into_dyn();
/// let features = cnn.forward(&images)?;
/// println!("Feature shape: {:?}", features.shape());
/// # Ok(())
/// # }
/// ```
/// ## Training and Evaluation Modes
/// ```rust
/// use scirs2_neural::layers::{Dense, Dropout, Layer};
/// use scirs2_neural::models::{Sequential, Model};
/// # fn example() -> scirs2_neural::error::Result<()> {
/// let mut rng = rand::rng();
/// let mut model: Sequential<f32> = Sequential::new();
/// model.add_layer(Dense::new(10, 5, Some("relu"), &mut rng)?);
/// model.add_layer(Dropout::new(0.5, &mut rng)?); // 50% dropout
/// model.add_layer(Dense::<f32>::new(5, 1, None, &mut rng)?);
/// let input = ndarray::Array::ones((4, 10)).into_dyn();
/// // Forward pass through the model
/// let output = model.forward(&input)?;
/// println!("Output shape: {:?}", output.shape());
/// # Ok(())
/// # }
/// ```
pub struct Sequential<F: Float + Debug + ScalarOperand> {
    layers: Vec<Box<dyn Layer<F> + Send + Sync>>,
    training: bool,
}
impl<F: Float + Debug + ScalarOperand> std::fmt::Debug for Sequential<F> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Sequential")
            .field("num_layers", &self.layers.len())
            .field("training", &self.training)
            .finish()
    }
}
// We can't clone trait objects directly.
// This is a minimal implementation that won't clone the actual layers.
impl<F: Float + Debug + ScalarOperand + 'static> Clone for Sequential<F> {
    fn clone(&self) -> Self {
        // We can't clone the layers, so we just create an empty Sequential
        // with the same training flag
        Self {
            layers: Vec::new(),
            training: self.training,
        }
    }
}
impl<F: Float + Debug + ScalarOperand> Default for Sequential<F> {
    fn default() -> Self {
        Self::new()
    }
}
impl<F: Float + Debug + ScalarOperand> Sequential<F> {
    /// Create a new Sequential container
    pub fn new() -> Self {
        Self {
            layers: Vec::new(),
            training: true,
        }
    }
    /// Add a layer to the container
    pub fn add<L: Layer<F> + Send + Sync + 'static>(&mut self, layer: L) {
        self.layers.push(Box::new(layer));
    }
    /// Get the number of layers
    pub fn len(&self) -> usize {
        self.layers.len()
    }
    /// Check if there are no layers
    pub fn is_empty(&self) -> bool {
        self.layers.is_empty()
    }
}
impl<F: Float + Debug + ScalarOperand> Layer<F> for Sequential<F> {
    fn forward(&self, input: &Array<F, ndarray::IxDyn>) -> Result<Array<F, ndarray::IxDyn>> {
        let mut output = input.clone();
        for layer in &self.layers {
            output = layer.forward(&output)?;
        }
        Ok(output)
    }
    fn backward(
        &self,
        _input: &Array<F, ndarray::IxDyn>,
        grad_output: &Array<F, ndarray::IxDyn>,
    ) -> Result<Array<F, ndarray::IxDyn>> {
        // For simplicity, we'll just return the grad_output as-is.
        // A real implementation would propagate through the layers in reverse.
        Ok(grad_output.clone())
    }
    fn update(&mut self, learning_rate: F) -> Result<()> {
        for layer in &mut self.layers {
            layer.update(learning_rate)?;
        }
        Ok(())
    }
    fn params(&self) -> Vec<Array<F, ndarray::IxDyn>> {
        let mut params = Vec::new();
        for layer in &self.layers {
            params.extend(layer.params());
        }
        params
    }
    fn set_training(&mut self, training: bool) {
        self.training = training;
        for layer in &mut self.layers {
            layer.set_training(training);
        }
    }
    fn is_training(&self) -> bool {
        self.training
    }
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
        self
    }
}
impl<F: Float + Debug + ScalarOperand + 'static> ParamLayer<F> for Sequential<F> {
    fn get_parameters(&self) -> Vec<&Array<F, ndarray::IxDyn>> {
        let mut params = Vec::new();
        for layer in &self.layers {
            // Try to downcast to ParamLayer to get parameters
            if let Some(param_layer) = layer
                .as_any()
                .downcast_ref::<Box<dyn ParamLayer<F> + Send + Sync>>()
            {
                params.extend(param_layer.get_parameters());
            }
        }
        params
    }
    fn get_gradients(&self) -> Vec<&Array<F, ndarray::IxDyn>> {
        let mut gradients = Vec::new();
        for layer in &self.layers {
            // Try to downcast to ParamLayer to get gradients
            if let Some(param_layer) = layer
                .as_any()
                .downcast_ref::<Box<dyn ParamLayer<F> + Send + Sync>>()
            {
                gradients.extend(param_layer.get_gradients());
            }
        }
        gradients
    }
    fn set_parameters(&mut self, mut params: Vec<Array<F, ndarray::IxDyn>>) -> Result<()> {
        let mut param_index = 0;
        for layer in &mut self.layers {
            // Try to downcast to ParamLayer to set parameters
            if let Some(param_layer) = layer
                .as_any_mut()
                .downcast_mut::<Box<dyn ParamLayer<F> + Send + Sync>>()
            {
                let layer_param_count = param_layer.get_parameters().len();
                if param_index + layer_param_count <= params.len() {
                    let layer_params = params
                        .drain(param_index..param_index + layer_param_count)
                        .collect();
                    param_layer.set_parameters(layer_params)?;
                    param_index += layer_param_count;
                }
            }
        }
        Ok(())
    }
}
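
// A minimal smoke-test sketch for the `Sequential` container using a parameterless
// identity layer (illustrative only; the crate's real test suite lives elsewhere).
#[cfg(test)]
mod sequential_smoke_tests {
    use super::*;
    use crate::error::Result;
    use ndarray::Array;

    /// Identity layer: passes data and gradients through unchanged.
    struct Identity;

    impl Layer<f64> for Identity {
        fn forward(&self, input: &Array<f64, ndarray::IxDyn>) -> Result<Array<f64, ndarray::IxDyn>> {
            Ok(input.clone())
        }
        fn backward(
            &self,
            _input: &Array<f64, ndarray::IxDyn>,
            grad_output: &Array<f64, ndarray::IxDyn>,
        ) -> Result<Array<f64, ndarray::IxDyn>> {
            Ok(grad_output.clone())
        }
        fn update(&mut self, _learning_rate: f64) -> Result<()> {
            Ok(())
        }
        fn as_any(&self) -> &dyn std::any::Any {
            self
        }
        fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
            self
        }
    }

    #[test]
    fn forward_chains_through_all_layers() {
        let mut model = Sequential::<f64>::new();
        model.add(Identity);
        model.add(Identity);
        assert_eq!(model.len(), 2);
        assert!(!model.is_empty());
        let input = Array::ones((2, 3)).into_dyn();
        let output = model.forward(&input).unwrap();
        assert_eq!(output.shape(), input.shape());
    }
}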