use crate as burn;

use crate::config::Config;
use crate::module::Module;
use crate::module::Param;
use crate::tensor::backend::Backend;
use crate::tensor::Tensor;

/// Configuration to create a [LayerNorm](LayerNorm) layer.
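///
/// # Example
///
/// A minimal configuration sketch; the `with_epsilon` setter is assumed here to be
/// generated by the [`Config`](crate::config::Config) derive for the defaulted field.
///
/// ```rust,ignore
/// // d_model = 512, overriding the default epsilon of 1e-5.
/// let config = LayerNormConfig::new(512).with_epsilon(1e-6);
/// ```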
#[derive(Config)]
pub struct LayerNormConfig {
    /// The size of the input features.
    pub d_model: usize,
    /// A value required for numerical stability. Default: 1e-5
    #[config(default = 1e-5)]
    pub epsilon: f64,
}

/// Applies Layer Normalization over an input tensor as described in the paper [Layer Normalization](https://arxiv.org/abs/1607.06450).
///
/// `Y = norm(X) * γ + β`
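///
/// # Example
///
/// A minimal usage sketch (not compiled as a doctest); `MyBackend` stands in for any
/// [`Backend`](crate::tensor::backend::Backend) implementation and is not defined here.
///
/// ```rust,ignore
/// let layer_norm = LayerNormConfig::new(4).init::<MyBackend>();
///
/// // The last dimension must match `d_model`; the output has the same shape as the input.
/// let input = Tensor::<MyBackend, 2>::from_data(Data::from([[1.0, 2.0, 3.0, 4.0]]));
/// let output = layer_norm.forward(input);
/// ```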
#[derive(Module, Debug)]
pub struct LayerNorm<B: Backend> {
    gamma: Param<Tensor<B, 1>>,
    beta: Param<Tensor<B, 1>>,
    epsilon: f64,
}

impl LayerNormConfig {
    /// Initialize a new [layer norm](LayerNorm) module.
    pub fn init<B: Backend>(&self) -> LayerNorm<B> {
        let gamma = Tensor::ones([self.d_model]);
        let beta = Tensor::zeros([self.d_model]);

        LayerNorm {
            gamma: Param::from(gamma),
            beta: Param::from(beta),
            epsilon: self.epsilon,
        }
    }

    /// Initialize a new [layer norm](LayerNorm) module with a [record](LayerNormRecord).
    pub fn init_with<B: Backend>(&self, record: LayerNormRecord<B>) -> LayerNorm<B> {
        LayerNorm {
            gamma: record.gamma,
            beta: record.beta,
            epsilon: self.epsilon,
        }
    }
}

impl<B: Backend> LayerNorm<B> {
    /// Applies the forward pass on the input tensor.
    ///
    /// # Shapes
    ///
    /// - input: `[..., any, d_model]`
    /// - output: `[..., any, d_model]`
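    ///
    /// The mean and the biased variance are computed over the last dimension (`d_model`),
    /// `epsilon` is added to the standard deviation before the division, and the result is
    /// scaled by `gamma` and shifted by `beta`.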
    pub fn forward<const D: usize>(&self, input: Tensor<B, D>) -> Tensor<B, D> {
        // Mean and biased variance over the last dimension (`d_model`).
        let (var, mean) = input.clone().var_mean_bias(D - 1);

        // Normalize; epsilon is added to the standard deviation to avoid division by zero.
        let input_normalized = input.sub(mean).div(var.sqrt().add_scalar(self.epsilon));

        input_normalized
            .mul(self.gamma.val().unsqueeze())
            .add(self.beta.val().unsqueeze())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use burn_tensor::Data;

    #[cfg(feature = "std")]
    use crate::{TestADBackend, TestBackend};

    #[cfg(not(feature = "std"))]
    use crate::TestBackend;

    #[test]
    fn layer_norm_forward() {
        let module = LayerNormConfig::new(10).init::<TestBackend>();
        let input = Tensor::from_data(Data::from([[
            -0.6897, -2.7106, 2.2222, -1.0330, -0.8933, 1.1765, 0.0601, 1.5252, -0.3630, 0.6728,
        ]]));

        let output = module.forward(input);

        output.to_data().assert_approx_eq(
            &Data::from([[
                -0.4990, -1.9680, 1.6178, -0.7486, -0.6470, 0.8576, 0.0461, 1.1111, -0.2614, 0.4915,
            ]]),
            3,
        );
    }

    #[cfg(feature = "std")]
    #[test]
    fn layer_norm_backward() {
        let module = LayerNormConfig::new(2).init::<TestADBackend>();
        let tensor_1 = Tensor::<TestADBackend, 2>::from_data(Data::from([[0.0, 1.0], [3.0, 4.0]]))
            .require_grad();
        let tensor_2 = Tensor::<TestADBackend, 2>::from_data(Data::from([[6.0, 7.0], [9.0, 10.0]]))
            .require_grad();

        let x = tensor_1.clone().matmul(tensor_2.clone());

        let output = module.forward(x);
        let grads = output.backward();

        let tensor_1_grad = tensor_1.grad(&grads).unwrap();
        let tensor_2_grad = tensor_2.grad(&grads).unwrap();
        let gamma_grad = module.gamma.grad(&grads).unwrap();
        let beta_grad = module.beta.grad(&grads).unwrap();

        gamma_grad
            .to_data()
            .assert_approx_eq(&Data::from([-2.0, 2.0]), 3);
        beta_grad
            .to_data()
            .assert_approx_eq(&Data::from([2.0, 2.0]), 3);
        tensor_1_grad
            .to_data()
            .assert_approx_eq(&Data::zeros(tensor_1_grad.shape()), 3);
        tensor_2_grad
            .to_data()
            .assert_approx_eq(&Data::zeros(tensor_2_grad.shape()), 3);
    }
}