aprender-core 0.50.0

//! Activation function modules.
//!
//! These modules wrap activation functions for use in Sequential containers.
//! For functional versions, see `nn::functional`.
//!
//! # References
//!
//! - Nair, V., & Hinton, G. E. (2010). Rectified linear units improve restricted
//!   Boltzmann machines. ICML.
//! - He, K., et al. (2015). Delving deep into rectifiers. ICCV.

use super::module::Module;
use crate::autograd::Tensor;

/// Rectified Linear Unit activation module.
///
/// Computes `ReLU(x) = max(0, x)`: negative entries become zero while
/// non-negative entries pass through unchanged.
///
/// # Shape
///
/// - Input: `(*)` any shape
/// - Output: `(*)` same shape as input
///
/// # Example
///
/// ```ignore
/// use aprender::nn::{Module, ReLU};
/// use aprender::autograd::Tensor;
///
/// let relu = ReLU::new();
/// let x = Tensor::from_slice(&[-1.0, 0.0, 1.0, 2.0]);
/// let y = relu.forward(&x);  // [0.0, 0.0, 1.0, 2.0]
/// ```
#[derive(Debug, Default, Clone, Copy)]
pub struct ReLU;

impl ReLU {
    /// Build a `ReLU` activation.
    ///
    /// The type carries no state, so this yields the same value as
    /// `ReLU::default()`.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }
}

impl Module for ReLU {
    /// Run the activation by calling `relu()` on `input` and returning the
    /// tensor it produces. `self` holds no state and is not consulted.
    fn forward(&self, input: &Tensor) -> Tensor {
        input.relu()
    }
}

/// Leaky `ReLU` activation: `LeakyReLU(x) = max(negative_slope * x, x)`
///
/// # Arguments
///
/// * `negative_slope` - Controls angle of negative slope (default: 0.01)
#[derive(Debug, Clone, Copy)]
pub struct LeakyReLU {
    /// Slope passed to `leaky_relu` during the forward pass.
    negative_slope: f32,
}

impl LeakyReLU {
    /// Negative slope used by [`LeakyReLU::new`] (and therefore `Default`).
    const DEFAULT_NEGATIVE_SLOPE: f32 = 0.01;

    /// Build a `LeakyReLU` that uses the default negative slope (0.01).
    #[must_use]
    pub fn new() -> Self {
        Self::with_slope(Self::DEFAULT_NEGATIVE_SLOPE)
    }

    /// Build a `LeakyReLU` with a caller-chosen negative slope.
    ///
    /// The value is stored exactly as given; no validation is performed.
    #[must_use]
    pub fn with_slope(negative_slope: f32) -> Self {
        Self { negative_slope }
    }
}

impl Default for LeakyReLU {
    /// Equivalent to [`LeakyReLU::new`]: a negative slope of 0.01.
    fn default() -> Self {
        Self::new()
    }
}

impl Module for LeakyReLU {
    /// Run the activation by calling `leaky_relu` on `input` with this
    /// module's stored `negative_slope`, returning the tensor it produces.
    fn forward(&self, input: &Tensor) -> Tensor {
        input.leaky_relu(self.negative_slope)
    }
}

/// Sigmoid activation: σ(x) = 1 / (1 + exp(-x))
///
/// Maps inputs to the (0, 1) range.
#[derive(Debug, Default, Clone, Copy)]
pub struct Sigmoid;

impl Sigmoid {
    /// Build a `Sigmoid` activation.
    ///
    /// The type carries no state, so this yields the same value as
    /// `Sigmoid::default()`.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }
}

impl Module for Sigmoid {
    /// Run the activation by calling `sigmoid()` on `input` and returning the
    /// tensor it produces. `self` holds no state and is not consulted.
    fn forward(&self, input: &Tensor) -> Tensor {
        input.sigmoid()
    }
}

/// Tanh activation: tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
///
/// Maps inputs to the (-1, 1) range.
#[derive(Debug, Default, Clone, Copy)]
pub struct Tanh;

impl Tanh {
    /// Build a `Tanh` activation.
    ///
    /// The type carries no state, so this yields the same value as
    /// `Tanh::default()`.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }
}

impl Module for Tanh {
    /// Run the activation by calling `tanh_()` on `input` and returning the
    /// tensor it produces. `self` holds no state and is not consulted.
    fn forward(&self, input: &Tensor) -> Tensor {
        input.tanh_()
    }
}

/// Gaussian Error Linear Unit (GELU) activation.
///
/// `GELU(x) = x * Φ(x)`, where Φ is the CDF of the standard normal
/// distribution.
/// Approximation: `0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³)))`
///
/// # Reference
///
/// - Hendrycks, D., & Gimpel, K. (2016). Gaussian Error Linear Units (GELUs).
#[derive(Debug, Default, Clone, Copy)]
pub struct GELU;

impl GELU {
    /// Build a `GELU` activation.
    ///
    /// The type carries no state, so this yields the same value as
    /// `GELU::default()`.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }
}

impl Module for GELU {
    /// Run the activation by calling `gelu()` on `input` and returning the
    /// tensor it produces. `self` holds no state and is not consulted.
    fn forward(&self, input: &Tensor) -> Tensor {
        input.gelu()
    }
}

/// Softmax activation: `softmax(x)_i = exp(x_i) / Σ_j exp(x_j)`
///
/// Turns logits into probabilities that sum to 1 along the configured
/// dimension, following the semantics of `torch.nn.Softmax(dim)`.
///
/// # Arguments
///
/// * `dim` - Dimension along which to compute softmax. Negative values index
///   from the end (e.g. `-1` is the last dimension), matching PyTorch.
///
/// # Supported dimensions
///
/// The forward pass handles the last dimension of any tensor and `dim = 0`
/// on a 2D tensor. Any other rank/dimension combination panics.
///
/// # Example
///
/// ```ignore
/// use aprender::nn::{Module, Softmax};
/// use aprender::autograd::Tensor;
///
/// // Column softmax (each column sums to 1), like torch.nn.Softmax(0)
/// let sm = Softmax::new(0);
/// let x = Tensor::new(&[1.0, 2.0, 3.0, 4.0], &[2, 2]);
/// let y = sm.forward(&x); // [[0.1192, 0.1192], [0.8808, 0.8808]]
/// ```
#[derive(Debug, Clone, Copy)]
pub struct Softmax {
    /// Dimension to normalise over; negative values count from the end.
    dim: i32,
}

impl Softmax {
    /// Dimension used by `Default`: `-1`, i.e. the last dimension.
    const DEFAULT_DIM: i32 = -1;

    /// Build a Softmax that normalises along `dim`.
    ///
    /// The value is stored as given; it is only resolved against a tensor's
    /// rank when the forward pass runs.
    #[must_use]
    pub fn new(dim: i32) -> Self {
        Self { dim }
    }
}

impl Default for Softmax {
    /// Softmax over the last dimension (`dim = -1`).
    fn default() -> Self {
        Self::new(Self::DEFAULT_DIM)
    }
}

impl Module for Softmax {
    /// Apply softmax to `input` along the configured dimension.
    ///
    /// A negative `dim` is resolved by adding the tensor's rank, so `-1`
    /// selects the last axis. Two cases are handled:
    ///
    /// - the resolved axis equals `ndim - 1`: the result of
    ///   `input.softmax()` is returned directly;
    /// - the tensor is 2D and the resolved axis is `0`: the input is
    ///   transposed, softmaxed over its (new) last axis, and transposed back.
    ///
    /// # Panics
    ///
    /// Panics for every other combination of rank and dimension. Softmax over
    /// an arbitrary axis of an n-d tensor is not implemented, and failing
    /// loudly is preferred over normalising the wrong axis.
    fn forward(&self, input: &Tensor) -> Tensor {
        let ndim = input.ndim() as i32;

        // PyTorch-style indexing: non-negative dims are used as-is, negative
        // dims count back from the end (-1 == last axis).
        let axis = if self.dim >= 0 {
            self.dim
        } else {
            self.dim + ndim
        };

        if axis != ndim - 1 {
            // Non-last axis. The only supported shape here is dim == 0 on a
            // 2D tensor (the `torch.nn.Softmax(0)` case); a general strided
            // reduction for higher ranks is future work.
            assert!(
                ndim == 2 && axis == 0,
                "Softmax: dim={} resolved to axis={} is unsupported for a \
                 {ndim}-D tensor; only the last dim or dim=0 on a 2D tensor are \
                 implemented",
                self.dim,
                axis,
            );

            // Swap the target axis into last position, reuse the last-dim
            // softmax, then swap back. The result is built purely from
            // `transpose` and `softmax` calls on the input, which keeps the
            // composition differentiable.
            return input.transpose().softmax().transpose();
        }

        // Last-axis softmax: hand off to the canonical `softmax()` kernel
        // unchanged.
        input.softmax()
    }
}

#[cfg(test)]
mod tests {
    // Unit tests for the activation modules in this file: forward-pass
    // values, output ranges, derived-trait coverage (Default/Debug/Clone/Copy)
    // and `Softmax` dimension handling.
    use super::*;

    // ReLU zeroes negative inputs and passes non-negative inputs through.
    #[test]
    fn test_relu() {
        let relu = ReLU::new();
        let x = Tensor::from_slice(&[-2.0, -1.0, 0.0, 1.0, 2.0]);
        let y = relu.forward(&x);

        assert_eq!(y.data(), &[0.0, 0.0, 0.0, 1.0, 2.0]);
    }

    // With slope 0.1, negative inputs are scaled by 0.1 and non-negative
    // inputs are unchanged.
    #[test]
    fn test_leaky_relu() {
        let lrelu = LeakyReLU::with_slope(0.1);
        let x = Tensor::from_slice(&[-2.0, -1.0, 0.0, 1.0, 2.0]);
        let y = lrelu.forward(&x);

        assert_eq!(y.data(), &[-0.2, -0.1, 0.0, 1.0, 2.0]);
    }

    // sigmoid(0) = 0.5
    #[test]
    fn test_sigmoid() {
        let sigmoid = Sigmoid::new();
        let x = Tensor::from_slice(&[0.0]);
        let y = sigmoid.forward(&x);

        assert!((y.data()[0] - 0.5).abs() < 1e-5);
    }

    // Sigmoid outputs stay strictly between 0 and 1 for moderate inputs.
    #[test]
    fn test_sigmoid_bounds() {
        let sigmoid = Sigmoid::new();
        let x = Tensor::from_slice(&[-10.0, 0.0, 10.0]);
        let y = sigmoid.forward(&x);

        // Should be in (0, 1)
        for &val in y.data() {
            assert!(val > 0.0 && val < 1.0);
        }
    }

    // tanh(0) = 0
    #[test]
    fn test_tanh() {
        let tanh = Tanh::new();
        let x = Tensor::from_slice(&[0.0]);
        let y = tanh.forward(&x);

        assert!((y.data()[0]).abs() < 1e-5);
    }

    // Tanh outputs stay within [-1, 1], and tanh(±2) lands near ±0.964.
    #[test]
    fn test_tanh_bounds() {
        let tanh = Tanh::new();
        let x = Tensor::from_slice(&[-2.0, 0.0, 2.0]);
        let y = tanh.forward(&x);

        // Should be in [-1, 1] (this check is inclusive; the tighter strict
        // bounds are asserted below)
        for &val in y.data() {
            assert!((-1.0..=1.0).contains(&val));
        }

        // More specific bounds for non-extreme values
        assert!(y.data()[0] > -1.0 && y.data()[0] < -0.9); // tanh(-2) ≈ -0.964
        assert!(y.data()[2] > 0.9 && y.data()[2] < 1.0); // tanh(2) ≈ 0.964
    }

    #[test]
    fn test_gelu() {
        let gelu = GELU::new();
        let x = Tensor::from_slice(&[0.0]);
        let y = gelu.forward(&x);

        // GELU(0) = 0
        assert!((y.data()[0]).abs() < 1e-5);
    }

    #[test]
    fn test_gelu_positive() {
        let gelu = GELU::new();
        let x = Tensor::from_slice(&[1.0]);
        let y = gelu.forward(&x);

        // GELU(1) ≈ 0.841
        assert!((y.data()[0] - 0.841).abs() < 0.01);
    }

    // Last-dim softmax on a [2, 3] tensor: every row is a probability
    // distribution.
    #[test]
    fn test_softmax_sums_to_one() {
        let softmax = Softmax::new(-1);
        let x = Tensor::new(&[1.0, 2.0, 3.0, 1.0, 2.0, 3.0], &[2, 3]);
        let y = softmax.forward(&x);

        // Each row should sum to 1
        let (batch, features) = (2, 3);
        for b in 0..batch {
            // Rows are read as contiguous runs of `features` values.
            let sum: f32 = (0..features).map(|j| y.data()[b * features + j]).sum();
            assert!((sum - 1.0).abs() < 1e-5, "Row {b} sums to {sum}");
        }
    }

    // Large logits must not overflow to NaN/Inf.
    #[test]
    fn test_softmax_numerical_stability() {
        let softmax = Softmax::new(-1);
        // Large values that could cause overflow without proper handling
        let x = Tensor::new(&[1000.0, 1001.0, 1002.0], &[1, 3]);
        let y = softmax.forward(&x);

        // Should not have NaN or Inf
        for &val in y.data() {
            assert!(val.is_finite());
            assert!((0.0..=1.0).contains(&val));
        }

        // Should still sum to 1
        let sum: f32 = y.data().iter().sum();
        assert!((sum - 1.0).abs() < 1e-5);
    }

    // =========================================================================
    // Additional coverage tests for Default impls and Debug
    // =========================================================================

    // `ReLU::default()` behaves like `ReLU::new()`.
    #[test]
    fn test_relu_default() {
        let relu = ReLU::default();
        let x = Tensor::from_slice(&[-1.0, 1.0]);
        let y = relu.forward(&x);
        assert_eq!(y.data(), &[0.0, 1.0]);
    }

    // Exercises the derived Debug, Clone and Copy impls; the explicit
    // `.clone()` on a `Copy` type is deliberate.
    #[test]
    fn test_relu_debug_clone_copy() {
        let relu = ReLU::new();
        let debug_str = format!("{:?}", relu);
        assert!(debug_str.contains("ReLU"));

        let cloned = relu.clone();
        let copied = relu;
        let _ = cloned.forward(&Tensor::from_slice(&[1.0]));
        let _ = copied.forward(&Tensor::from_slice(&[1.0]));
    }

    #[test]
    fn test_leaky_relu_default() {
        let lrelu = LeakyReLU::default();
        let x = Tensor::from_slice(&[-100.0]);
        let y = lrelu.forward(&x);
        // Default slope is 0.01
        assert!((y.data()[0] - (-1.0)).abs() < 0.001);
    }

    // Exercises the derived Debug, Clone and Copy impls.
    #[test]
    fn test_leaky_relu_debug_clone_copy() {
        let lrelu = LeakyReLU::new();
        let debug_str = format!("{:?}", lrelu);
        assert!(debug_str.contains("LeakyReLU"));

        let cloned = lrelu.clone();
        let copied = lrelu;
        let _ = cloned.forward(&Tensor::from_slice(&[1.0]));
        let _ = copied.forward(&Tensor::from_slice(&[1.0]));
    }

    // `Sigmoid::default()` behaves like `Sigmoid::new()`.
    #[test]
    fn test_sigmoid_default() {
        let sigmoid = Sigmoid::default();
        let x = Tensor::from_slice(&[0.0]);
        let y = sigmoid.forward(&x);
        assert!((y.data()[0] - 0.5).abs() < 1e-5);
    }

    // Exercises the derived Debug, Clone and Copy impls.
    #[test]
    fn test_sigmoid_debug_clone_copy() {
        let sigmoid = Sigmoid::new();
        let debug_str = format!("{:?}", sigmoid);
        assert!(debug_str.contains("Sigmoid"));

        let cloned = sigmoid.clone();
        let copied = sigmoid;
        let _ = cloned.forward(&Tensor::from_slice(&[0.0]));
        let _ = copied.forward(&Tensor::from_slice(&[0.0]));
    }

    // `Tanh::default()` behaves like `Tanh::new()`.
    #[test]
    fn test_tanh_default() {
        let tanh = Tanh::default();
        let x = Tensor::from_slice(&[0.0]);
        let y = tanh.forward(&x);
        assert!((y.data()[0]).abs() < 1e-5);
    }

    // Exercises the derived Debug, Clone and Copy impls.
    #[test]
    fn test_tanh_debug_clone_copy() {
        let tanh = Tanh::new();
        let debug_str = format!("{:?}", tanh);
        assert!(debug_str.contains("Tanh"));

        let cloned = tanh.clone();
        let copied = tanh;
        let _ = cloned.forward(&Tensor::from_slice(&[0.0]));
        let _ = copied.forward(&Tensor::from_slice(&[0.0]));
    }

    // `GELU::default()` behaves like `GELU::new()`.
    #[test]
    fn test_gelu_default() {
        let gelu = GELU::default();
        let x = Tensor::from_slice(&[0.0]);
        let y = gelu.forward(&x);
        assert!((y.data()[0]).abs() < 1e-5);
    }

    // Exercises the derived Debug, Clone and Copy impls.
    #[test]
    fn test_gelu_debug_clone_copy() {
        let gelu = GELU::new();
        let debug_str = format!("{:?}", gelu);
        assert!(debug_str.contains("GELU"));

        let cloned = gelu.clone();
        let copied = gelu;
        let _ = cloned.forward(&Tensor::from_slice(&[1.0]));
        let _ = copied.forward(&Tensor::from_slice(&[1.0]));
    }

    // The default Softmax normalises over the last dimension.
    #[test]
    fn test_softmax_default() {
        let softmax = Softmax::default(); // dim = -1
        let x = Tensor::new(&[1.0, 2.0, 3.0], &[1, 3]);
        let y = softmax.forward(&x);
        let sum: f32 = y.data().iter().sum();
        assert!((sum - 1.0).abs() < 1e-5);
    }

    // Exercises the derived Debug, Clone and Copy impls.
    #[test]
    fn test_softmax_debug_clone_copy() {
        let softmax = Softmax::new(-1);
        let debug_str = format!("{:?}", softmax);
        assert!(debug_str.contains("Softmax"));

        let cloned = softmax.clone();
        let copied = softmax;
        let _ = cloned.forward(&Tensor::new(&[1.0, 2.0], &[1, 2]));
        let _ = copied.forward(&Tensor::new(&[1.0, 2.0], &[1, 2]));
    }

    // =========================================================================
    // PMAT-867: Softmax must honor `dim` (torch.nn.Softmax(dim) parity).
    //
    // FALSIFIER: `Softmax::new(0)` must softmax over the COLUMNS (axis 0), not
    // silently over the last axis. For [[1,2],[3,4]]:
    //   torch.nn.Softmax(0) -> [[0.1192, 0.1192], [0.8808, 0.8808]]
    //   torch.nn.Softmax(1) -> [[0.2689, 0.7311], [0.2689, 0.7311]]
    // RED (dim-ignored bug): col0 top == 0.2689 (row softmax). GREEN: 0.1192.
    // =========================================================================

    #[test]
    fn test_softmax_dim0_column_pmat867() {
        // [[1, 2], [3, 4]] — column softmax (each column sums to 1).
        let softmax = Softmax::new(0);
        let x = Tensor::new(&[1.0, 2.0, 3.0, 4.0], &[2, 2]);
        let y = softmax.forward(&x);
        let d = y.data();

        // Element [0][0] must be the column-softmax value 0.1192, NOT the
        // row-softmax value 0.2689 the dim-ignored bug produced.
        assert!(
            (d[0] - 0.1192).abs() < 1e-3,
            "Softmax(0)[0][0] = {} (expected 0.1192 column-softmax, \
             0.2689 means dim was ignored)",
            d[0]
        );
        assert!((d[1] - 0.1192).abs() < 1e-3, "[0][1] = {}", d[1]);
        assert!((d[2] - 0.8808).abs() < 1e-3, "[1][0] = {}", d[2]);
        assert!((d[3] - 0.8808).abs() < 1e-3, "[1][1] = {}", d[3]);

        // Each COLUMN must sum to 1 along axis 0.
        let col0 = d[0] + d[2];
        let col1 = d[1] + d[3];
        assert!((col0 - 1.0).abs() < 1e-5, "column 0 sums to {col0}");
        assert!((col1 - 1.0).abs() < 1e-5, "column 1 sums to {col1}");
    }

    #[test]
    fn test_softmax_dim1_and_neg1_row_unchanged_pmat867() {
        // dim = 1 (last axis) and dim = -1 must both be row softmax, unchanged.
        let x = Tensor::new(&[1.0, 2.0, 3.0, 4.0], &[2, 2]);

        for dim in [1, -1] {
            let y = Softmax::new(dim).forward(&x);
            let d = y.data();
            // Row 0 of [1, 2] -> [0.2689, 0.7311]
            assert!(
                (d[0] - 0.2689).abs() < 1e-3,
                "Softmax({dim})[0][0] = {} (expected 0.2689 row-softmax)",
                d[0]
            );
            assert!((d[1] - 0.7311).abs() < 1e-3, "[0][1] = {}", d[1]);
            // Each ROW sums to 1 along axis 1.
            let row0 = d[0] + d[1];
            let row1 = d[2] + d[3];
            assert!((row0 - 1.0).abs() < 1e-5, "row 0 sums to {row0}");
            assert!((row1 - 1.0).abs() < 1e-5, "row 1 sums to {row1}");
        }
    }

    #[test]
    fn test_softmax_dim0_preserves_gradients_pmat867() {
        use crate::autograd::clear_graph;

        // The dim=0 path composes differentiable ops (transpose ∘ softmax ∘
        // transpose); gradients must still flow back to the input.
        // Clear the autograd graph first so this test starts from a clean
        // state.
        clear_graph();
        let x = Tensor::new(&[1.0, 2.0, 3.0, 4.0], &[2, 2]).requires_grad();
        // Keep the id so the gradient can be looked up after `backward()`.
        let x_id = x.id();

        // Scalar loss = sum(softmax(x, dim=0)) so backward is well-defined.
        let loss = Softmax::new(0).forward(&x).sum();
        loss.backward();

        let grad = crate::autograd::get_grad(x_id)
            .expect("Softmax(0) forward must keep the input differentiable");
        assert_eq!(grad.numel(), 4, "gradient must cover every input element");
        // sum(softmax) is constant (== n_cols), so d(loss)/d(x) ≈ 0 everywhere.
        for &g in grad.data() {
            assert!(g.abs() < 1e-4, "grad {g} should be ~0 for sum-of-softmax");
        }
    }
}